| | | |
|-----------|------------------------------------------------------------|----------------------------|
| author    | Dimitry Andric <dim@FreeBSD.org>                           | 2017-01-02 19:18:08 +0000  |
| committer | Dimitry Andric <dim@FreeBSD.org>                           | 2017-01-02 19:18:08 +0000  |
| commit    | bab175ec4b075c8076ba14c762900392533f6ee4 (patch)           |                            |
| tree      | 01f4f29419a2cb10abe13c1e63cd2a66068b0137 /lib/Headers      |                            |
| parent    | 8b7a8012d223fac5d17d16a66bb39168a9a1dfc0 (diff)            |                            |
| download  | src-test2-bab175ec4b075c8076ba14c762900392533f6ee4.tar.gz src-test2-bab175ec4b075c8076ba14c762900392533f6ee4.zip | |
Diffstat (limited to 'lib/Headers')
38 files changed, 13669 insertions, 5653 deletions
diff --git a/lib/Headers/CMakeLists.txt b/lib/Headers/CMakeLists.txt index fa2d2107781b..efc4dd0971b6 100644 --- a/lib/Headers/CMakeLists.txt +++ b/lib/Headers/CMakeLists.txt @@ -3,6 +3,7 @@ set(files altivec.h ammintrin.h arm_acle.h + armintr.h avx2intrin.h avx512bwintrin.h avx512cdintrin.h @@ -21,12 +22,13 @@ set(files avxintrin.h bmi2intrin.h bmiintrin.h + __clang_cuda_builtin_vars.h __clang_cuda_cmath.h + __clang_cuda_complex_builtins.h __clang_cuda_intrinsics.h __clang_cuda_math_forward_declares.h __clang_cuda_runtime_wrapper.h cpuid.h - cuda_builtin_vars.h clflushoptintrin.h emmintrin.h f16cintrin.h @@ -88,6 +90,12 @@ set(files xtestintrin.h ) +set(cuda_wrapper_files + cuda_wrappers/algorithm + cuda_wrappers/complex + cuda_wrappers/new +) + set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include) # Generate arm_neon.h @@ -95,7 +103,7 @@ clang_tablegen(arm_neon.h -gen-arm-neon SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td) set(out_files) -foreach( f ${files} ) +foreach( f ${files} ${cuda_wrapper_files} ) set( src ${CMAKE_CURRENT_SOURCE_DIR}/${f} ) set( dst ${output_dir}/${f} ) add_custom_command(OUTPUT ${dst} @@ -120,6 +128,12 @@ install( PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include) +install( + FILES ${cuda_wrapper_files} + COMPONENT clang-headers + PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ + DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include/cuda_wrappers) + if (NOT CMAKE_CONFIGURATION_TYPES) # don't add this for IDE's. add_custom_target(install-clang-headers DEPENDS clang-headers diff --git a/lib/Headers/cuda_builtin_vars.h b/lib/Headers/__clang_cuda_builtin_vars.h index 6f5eb9c78d85..6f5eb9c78d85 100644 --- a/lib/Headers/cuda_builtin_vars.h +++ b/lib/Headers/__clang_cuda_builtin_vars.h diff --git a/lib/Headers/__clang_cuda_cmath.h b/lib/Headers/__clang_cuda_cmath.h index ae7ff2f8d306..0eaa08b30cab 100644 --- a/lib/Headers/__clang_cuda_cmath.h +++ b/lib/Headers/__clang_cuda_cmath.h @@ -26,13 +26,15 @@ #error "This file is for CUDA compilation only." #endif +#include <limits> + // CUDA lets us use various std math functions on the device side. This file // works in concert with __clang_cuda_math_forward_declares.h to make this work. // // Specifically, the forward-declares header declares __device__ overloads for // these functions in the global namespace, then pulls them into namespace std // with 'using' statements. Then this file implements those functions, after -// the implementations have been pulled in. +// their implementations have been pulled in. // // It's important that we declare the functions in the global namespace and pull // them into namespace std with using statements, as opposed to simply declaring @@ -73,7 +75,10 @@ __DEVICE__ float frexp(float __arg, int *__exp) { __DEVICE__ bool isinf(float __x) { return ::__isinff(__x); } __DEVICE__ bool isinf(double __x) { return ::__isinf(__x); } __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); } -__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); } +// For inscrutable reasons, __finite(), the double-precision version of +// __finitef, does not exist when compiling for MacOS. __isfinited is available +// everywhere and is just as good. 
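Because the forward-declares/using machinery described in that comment is spread across two headers, a reduced sketch of the pattern may help; it is illustrative only, shows a single function, and assumes a `__DEVICE__` macro like the one these headers define:

```c++
// Sketch only: how __clang_cuda_math_forward_declares.h and __clang_cuda_cmath.h
// cooperate. Declarations go into the global namespace first, get pulled into
// namespace std with a using-declaration, and are defined afterwards.
#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))

// 1) The forward-declares header declares the __device__ overload globally ...
__DEVICE__ bool isfinite(double);

// ... and makes it visible as std::isfinite.
namespace std {
using ::isfinite;
}

// 2) __clang_cuda_cmath.h then supplies the definition, forwarding to the
//    libdevice helper (matching the isfinite(double) definition in the diff).
__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
```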
+__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); } __DEVICE__ bool isgreater(float __x, float __y) { return __builtin_isgreater(__x, __y); } @@ -120,12 +125,15 @@ __DEVICE__ float ldexp(float __arg, int __exp) { __DEVICE__ float log(float __x) { return ::logf(__x); } __DEVICE__ float log10(float __x) { return ::log10f(__x); } __DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); } -__DEVICE__ float nexttoward(float __from, float __to) { +__DEVICE__ float nexttoward(float __from, double __to) { return __builtin_nexttowardf(__from, __to); } __DEVICE__ double nexttoward(double __from, double __to) { return __builtin_nexttoward(__from, __to); } +__DEVICE__ float nexttowardf(float __from, double __to) { + return __builtin_nexttowardf(__from, __to); +} __DEVICE__ float pow(float __base, float __exp) { return ::powf(__base, __exp); } @@ -136,13 +144,338 @@ __DEVICE__ double pow(double __base, int __iexp) { return ::powi(__base, __iexp); } __DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); } -__DEVICE__ bool signbit(double __x) { return ::__signbit(__x); } +__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); } __DEVICE__ float sin(float __x) { return ::sinf(__x); } __DEVICE__ float sinh(float __x) { return ::sinhf(__x); } __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); } __DEVICE__ float tan(float __x) { return ::tanf(__x); } __DEVICE__ float tanh(float __x) { return ::tanhf(__x); } +// Now we've defined everything we promised we'd define in +// __clang_cuda_math_forward_declares.h. We need to do two additional things to +// fix up our math functions. +// +// 1) Define __device__ overloads for e.g. sin(int). The CUDA headers define +// only sin(float) and sin(double), which means that e.g. sin(0) is +// ambiguous. +// +// 2) Pull the __device__ overloads of "foobarf" math functions into namespace +// std. These are defined in the CUDA headers in the global namespace, +// independent of everything else we've done here. + +// We can't use std::enable_if, because we want to be pre-C++11 compatible. But +// we go ahead and unconditionally define functions that are only available when +// compiling for C++11 to match the behavior of the CUDA headers. +template<bool __B, class __T = void> +struct __clang_cuda_enable_if {}; + +template <class __T> struct __clang_cuda_enable_if<true, __T> { + typedef __T type; +}; + +// Defines an overload of __fn that accepts one integral argument, calls +// __fn((double)x), and returns __retty. +#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn) \ + template <typename __T> \ + __DEVICE__ \ + typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, \ + __retty>::type \ + __fn(__T __x) { \ + return ::__fn((double)__x); \ + } + +// Defines an overload of __fn that accepts one two arithmetic arguments, calls +// __fn((double)x, (double)y), and returns a double. +// +// Note this is different from OVERLOAD_1, which generates an overload that +// accepts only *integral* arguments. 
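For concreteness, a single instantiation of the first macro, `__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)`, expands to roughly the following (formatting adjusted; the OVERLOAD_2 variant defined next is analogous but takes two arithmetic arguments):

```c++
// Approximate expansion of __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin):
// an overload that is only viable for integral __T (SFINAE via the pre-C++11
// __clang_cuda_enable_if above) and that forwards to the double version.
template <typename __T>
__DEVICE__
    typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
                                    double>::type
    sin(__T __x) {
  return ::sin((double)__x);
}
// Net effect: sin(0) is no longer ambiguous between sin(float) and
// sin(double); integral arguments pick this template and promote to double.
```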
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn) \ + template <typename __T1, typename __T2> \ + __DEVICE__ typename __clang_cuda_enable_if< \ + std::numeric_limits<__T1>::is_specialized && \ + std::numeric_limits<__T2>::is_specialized, \ + __retty>::type \ + __fn(__T1 __x, __T2 __y) { \ + return __fn((double)__x, (double)__y); \ + } + +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round); +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma) +__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc); + +#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1 +#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2 + +// Overloads for 
functions that don't match the patterns expected by +// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}. +template <typename __T1, typename __T2, typename __T3> +__DEVICE__ typename __clang_cuda_enable_if< + std::numeric_limits<__T1>::is_specialized && + std::numeric_limits<__T2>::is_specialized && + std::numeric_limits<__T3>::is_specialized, + double>::type +fma(__T1 __x, __T2 __y, __T3 __z) { + return std::fma((double)__x, (double)__y, (double)__z); +} + +template <typename __T> +__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, + double>::type +frexp(__T __x, int *__exp) { + return std::frexp((double)__x, __exp); +} + +template <typename __T> +__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, + double>::type +ldexp(__T __x, int __exp) { + return std::ldexp((double)__x, __exp); +} + +template <typename __T> +__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, + double>::type +nexttoward(__T __from, double __to) { + return std::nexttoward((double)__from, __to); +} + +template <typename __T1, typename __T2> +__DEVICE__ typename __clang_cuda_enable_if< + std::numeric_limits<__T1>::is_specialized && + std::numeric_limits<__T2>::is_specialized, + double>::type +remquo(__T1 __x, __T2 __y, int *__quo) { + return std::remquo((double)__x, (double)__y, __quo); +} + +template <typename __T> +__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, + double>::type +scalbln(__T __x, long __exp) { + return std::scalbln((double)__x, __exp); +} + +template <typename __T> +__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, + double>::type +scalbn(__T __x, int __exp) { + return std::scalbn((double)__x, __exp); +} + +// We need to define these overloads in exactly the namespace our standard +// library uses (including the right inline namespace), otherwise they won't be +// picked up by other functions in the standard library (e.g. functions in +// <complex>). Thus the ugliness below. +#ifdef _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_NAMESPACE_STD +#else +namespace std { +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_BEGIN_NAMESPACE_VERSION +#endif +#endif + +// Pull the new overloads we defined above into namespace std. 
+using ::acos; +using ::acosh; +using ::asin; +using ::asinh; +using ::atan; +using ::atan2; +using ::atanh; +using ::cbrt; +using ::ceil; +using ::copysign; +using ::cos; +using ::cosh; +using ::erf; +using ::erfc; +using ::exp; +using ::exp2; +using ::expm1; +using ::fabs; +using ::fdim; +using ::floor; +using ::fma; +using ::fmax; +using ::fmin; +using ::fmod; +using ::fpclassify; +using ::frexp; +using ::hypot; +using ::ilogb; +using ::isfinite; +using ::isgreater; +using ::isgreaterequal; +using ::isless; +using ::islessequal; +using ::islessgreater; +using ::isnormal; +using ::isunordered; +using ::ldexp; +using ::lgamma; +using ::llrint; +using ::llround; +using ::log; +using ::log10; +using ::log1p; +using ::log2; +using ::logb; +using ::lrint; +using ::lround; +using ::nearbyint; +using ::nextafter; +using ::nexttoward; +using ::pow; +using ::remainder; +using ::remquo; +using ::rint; +using ::round; +using ::scalbln; +using ::scalbn; +using ::signbit; +using ::sin; +using ::sinh; +using ::sqrt; +using ::tan; +using ::tanh; +using ::tgamma; +using ::trunc; + +// Well this is fun: We need to pull these symbols in for libc++, but we can't +// pull them in with libstdc++, because its ::isinf and ::isnan are different +// than its std::isinf and std::isnan. +#ifndef __GLIBCXX__ +using ::isinf; +using ::isnan; +#endif + +// Finally, pull the "foobarf" functions that CUDA defines in its headers into +// namespace std. +using ::acosf; +using ::acoshf; +using ::asinf; +using ::asinhf; +using ::atan2f; +using ::atanf; +using ::atanhf; +using ::cbrtf; +using ::ceilf; +using ::copysignf; +using ::cosf; +using ::coshf; +using ::erfcf; +using ::erff; +using ::exp2f; +using ::expf; +using ::expm1f; +using ::fabsf; +using ::fdimf; +using ::floorf; +using ::fmaf; +using ::fmaxf; +using ::fminf; +using ::fmodf; +using ::frexpf; +using ::hypotf; +using ::ilogbf; +using ::ldexpf; +using ::lgammaf; +using ::llrintf; +using ::llroundf; +using ::log10f; +using ::log1pf; +using ::log2f; +using ::logbf; +using ::logf; +using ::lrintf; +using ::lroundf; +using ::modff; +using ::nearbyintf; +using ::nextafterf; +using ::nexttowardf; +using ::nexttowardf; +using ::powf; +using ::remainderf; +using ::remquof; +using ::rintf; +using ::roundf; +using ::scalblnf; +using ::scalbnf; +using ::sinf; +using ::sinhf; +using ::sqrtf; +using ::tanf; +using ::tanhf; +using ::tgammaf; +using ::truncf; + +#ifdef _LIBCPP_END_NAMESPACE_STD +_LIBCPP_END_NAMESPACE_STD +#else +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_END_NAMESPACE_VERSION +#endif +} // namespace std +#endif + #undef __DEVICE__ #endif diff --git a/lib/Headers/__clang_cuda_complex_builtins.h b/lib/Headers/__clang_cuda_complex_builtins.h new file mode 100644 index 000000000000..beef7deff87f --- /dev/null +++ b/lib/Headers/__clang_cuda_complex_builtins.h @@ -0,0 +1,203 @@ +/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_COMPLEX_BUILTINS +#define __CLANG_CUDA_COMPLEX_BUILTINS + +// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. These are +// libgcc functions that clang assumes are available when compiling c99 complex +// operations. (These implementations come from libc++, and have been modified +// to work with CUDA.) + +extern "C" inline __device__ double _Complex __muldc3(double __a, double __b, + double __c, double __d) { + double __ac = __a * __c; + double __bd = __b * __d; + double __ad = __a * __d; + double __bc = __b * __c; + double _Complex z; + __real__(z) = __ac - __bd; + __imag__(z) = __ad + __bc; + if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) { + int __recalc = 0; + if (std::isinf(__a) || std::isinf(__b)) { + __a = std::copysign(std::isinf(__a) ? 1 : 0, __a); + __b = std::copysign(std::isinf(__b) ? 1 : 0, __b); + if (std::isnan(__c)) + __c = std::copysign(0, __c); + if (std::isnan(__d)) + __d = std::copysign(0, __d); + __recalc = 1; + } + if (std::isinf(__c) || std::isinf(__d)) { + __c = std::copysign(std::isinf(__c) ? 1 : 0, __c); + __d = std::copysign(std::isinf(__d) ? 1 : 0, __d); + if (std::isnan(__a)) + __a = std::copysign(0, __a); + if (std::isnan(__b)) + __b = std::copysign(0, __b); + __recalc = 1; + } + if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) || + std::isinf(__ad) || std::isinf(__bc))) { + if (std::isnan(__a)) + __a = std::copysign(0, __a); + if (std::isnan(__b)) + __b = std::copysign(0, __b); + if (std::isnan(__c)) + __c = std::copysign(0, __c); + if (std::isnan(__d)) + __d = std::copysign(0, __d); + __recalc = 1; + } + if (__recalc) { + // Can't use std::numeric_limits<double>::infinity() -- that doesn't have + // a device overload (and isn't constexpr before C++11, naturally). + __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d); + __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c); + } + } + return z; +} + +extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b, + float __c, float __d) { + float __ac = __a * __c; + float __bd = __b * __d; + float __ad = __a * __d; + float __bc = __b * __c; + float _Complex z; + __real__(z) = __ac - __bd; + __imag__(z) = __ad + __bc; + if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) { + int __recalc = 0; + if (std::isinf(__a) || std::isinf(__b)) { + __a = std::copysign(std::isinf(__a) ? 1 : 0, __a); + __b = std::copysign(std::isinf(__b) ? 1 : 0, __b); + if (std::isnan(__c)) + __c = std::copysign(0, __c); + if (std::isnan(__d)) + __d = std::copysign(0, __d); + __recalc = 1; + } + if (std::isinf(__c) || std::isinf(__d)) { + __c = std::copysign(std::isinf(__c) ? 1 : 0, __c); + __d = std::copysign(std::isinf(__d) ? 
1 : 0, __d); + if (std::isnan(__a)) + __a = std::copysign(0, __a); + if (std::isnan(__b)) + __b = std::copysign(0, __b); + __recalc = 1; + } + if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) || + std::isinf(__ad) || std::isinf(__bc))) { + if (std::isnan(__a)) + __a = std::copysign(0, __a); + if (std::isnan(__b)) + __b = std::copysign(0, __b); + if (std::isnan(__c)) + __c = std::copysign(0, __c); + if (std::isnan(__d)) + __d = std::copysign(0, __d); + __recalc = 1; + } + if (__recalc) { + __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d); + __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c); + } + } + return z; +} + +extern "C" inline __device__ double _Complex __divdc3(double __a, double __b, + double __c, double __d) { + int __ilogbw = 0; + // Can't use std::max, because that's defined in <algorithm>, and we don't + // want to pull that in for every compile. The CUDA headers define + // ::max(float, float) and ::max(double, double), which is sufficient for us. + double __logbw = std::logb(max(std::abs(__c), std::abs(__d))); + if (std::isfinite(__logbw)) { + __ilogbw = (int)__logbw; + __c = std::scalbn(__c, -__ilogbw); + __d = std::scalbn(__d, -__ilogbw); + } + double __denom = __c * __c + __d * __d; + double _Complex z; + __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw); + __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw); + if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) { + if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) { + __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a; + __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b; + } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) && + std::isfinite(__d)) { + __a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a); + __b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b); + __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d); + __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d); + } else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) && + std::isfinite(__b)) { + __c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c); + __d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d); + __real__(z) = 0.0 * (__a * __c + __b * __d); + __imag__(z) = 0.0 * (__b * __c - __a * __d); + } + } + return z; +} + +extern "C" inline __device__ float _Complex __divsc3(float __a, float __b, + float __c, float __d) { + int __ilogbw = 0; + float __logbw = std::logb(max(std::abs(__c), std::abs(__d))); + if (std::isfinite(__logbw)) { + __ilogbw = (int)__logbw; + __c = std::scalbn(__c, -__ilogbw); + __d = std::scalbn(__d, -__ilogbw); + } + float __denom = __c * __c + __d * __d; + float _Complex z; + __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw); + __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw); + if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) { + if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) { + __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a; + __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b; + } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) && + std::isfinite(__d)) { + __a = std::copysign(std::isinf(__a) ? 1 : 0, __a); + __b = std::copysign(std::isinf(__b) ? 
1 : 0, __b); + __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d); + __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d); + } else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) && + std::isfinite(__b)) { + __c = std::copysign(std::isinf(__c) ? 1 : 0, __c); + __d = std::copysign(std::isinf(__d) ? 1 : 0, __d); + __real__(z) = 0 * (__a * __c + __b * __d); + __imag__(z) = 0 * (__b * __c - __a * __d); + } + } + return z; +} + +#endif // __CLANG_CUDA_COMPLEX_BUILTINS diff --git a/lib/Headers/__clang_cuda_math_forward_declares.h b/lib/Headers/__clang_cuda_math_forward_declares.h index 3f2834d95000..49c805151d65 100644 --- a/lib/Headers/__clang_cuda_math_forward_declares.h +++ b/lib/Headers/__clang_cuda_math_forward_declares.h @@ -140,6 +140,7 @@ __DEVICE__ long lrint(double); __DEVICE__ long lrint(float); __DEVICE__ long lround(double); __DEVICE__ long lround(float); +__DEVICE__ long long llround(float); // No llround(double). __DEVICE__ double modf(double, double *); __DEVICE__ float modf(float, float *); __DEVICE__ double nan(const char *); @@ -149,7 +150,8 @@ __DEVICE__ float nearbyint(float); __DEVICE__ double nextafter(double, double); __DEVICE__ float nextafter(float, float); __DEVICE__ double nexttoward(double, double); -__DEVICE__ float nexttoward(float, float); +__DEVICE__ float nexttoward(float, double); +__DEVICE__ float nexttowardf(float, double); __DEVICE__ double pow(double, double); __DEVICE__ double pow(double, int); __DEVICE__ float pow(float, float); @@ -183,7 +185,19 @@ __DEVICE__ float tgamma(float); __DEVICE__ double trunc(double); __DEVICE__ float trunc(float); +// We need to define these overloads in exactly the namespace our standard +// library uses (including the right inline namespace), otherwise they won't be +// picked up by other functions in the standard library (e.g. functions in +// <complex>). Thus the ugliness below. +#ifdef _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_NAMESPACE_STD +#else namespace std { +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_BEGIN_NAMESPACE_VERSION +#endif +#endif + using ::abs; using ::acos; using ::acosh; @@ -235,6 +249,7 @@ using ::log2; using ::logb; using ::lrint; using ::lround; +using ::llround; using ::modf; using ::nan; using ::nanf; @@ -256,7 +271,15 @@ using ::tan; using ::tanh; using ::tgamma; using ::trunc; + +#ifdef _LIBCPP_END_NAMESPACE_STD +_LIBCPP_END_NAMESPACE_STD +#else +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_END_NAMESPACE_VERSION +#endif } // namespace std +#endif #pragma pop_macro("__DEVICE__") diff --git a/lib/Headers/__clang_cuda_runtime_wrapper.h b/lib/Headers/__clang_cuda_runtime_wrapper.h index 6445f9b76b8f..205e15b40b5d 100644 --- a/lib/Headers/__clang_cuda_runtime_wrapper.h +++ b/lib/Headers/__clang_cuda_runtime_wrapper.h @@ -62,7 +62,7 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000 #error "Unsupported CUDA version!" #endif @@ -72,9 +72,9 @@ #define __CUDA_ARCH__ 350 #endif -#include "cuda_builtin_vars.h" +#include "__clang_cuda_builtin_vars.h" -// No need for device_launch_parameters.h as cuda_builtin_vars.h above +// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above // has taken care of builtin variables declared in the file. 
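For context on why the new __clang_cuda_complex_builtins.h above is needed at all: as its header comment notes, clang lowers C99 `_Complex` multiplication and division to the libgcc-style helpers (`__mulsc3`/`__muldc3`, `__divsc3`/`__divdc3`), so device code that performs complex arithmetic needs `__device__` versions of them. A hypothetical device function (not part of the commit) that would trigger those calls:

```c++
// Hypothetical CUDA device code. Under default C99/Annex G semantics, clang
// emits a call to __mulsc3 for the '*' and __divsc3 for the '/', which is why
// __clang_cuda_complex_builtins.h must provide __device__ definitions of them.
__device__ float _Complex __scale(float _Complex __a, float _Complex __b) {
  return (__a * __b) / (__a + __b);
}
```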
#define __DEVICE_LAUNCH_PARAMETERS_H__ @@ -113,6 +113,7 @@ #undef __cxa_vec_ctor #undef __cxa_vec_cctor #undef __cxa_vec_dtor +#undef __cxa_vec_new #undef __cxa_vec_new2 #undef __cxa_vec_new3 #undef __cxa_vec_delete2 @@ -120,6 +121,15 @@ #undef __cxa_vec_delete3 #undef __cxa_pure_virtual +// math_functions.hpp expects this host function be defined on MacOS, but it +// ends up not being there because of the games we play here. Just define it +// ourselves; it's simple enough. +#ifdef __APPLE__ +inline __host__ double __signbitd(double x) { + return std::signbit(x); +} +#endif + // We need decls for functions in CUDA's libdevice with __device__ // attribute only. Alas they come either as __host__ __device__ or // with no attributes at all. To work around that, define __CUDA_RTC__ @@ -135,6 +145,21 @@ // the headers we're about to include. #define __host__ UNEXPECTED_HOST_ATTRIBUTE +// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values. +// Previous versions used to check whether they are defined or not. +// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it +// here to detect the switch. + +#if defined(CU_DEVICE_INVALID) +#if !defined(__USE_FAST_MATH__) +#define __USE_FAST_MATH__ 0 +#endif + +#if !defined(__CUDA_PREC_DIV) +#define __CUDA_PREC_DIV 0 +#endif +#endif + // device_functions.hpp and math_functions*.hpp use 'static // __forceinline__' (with no __device__) for definitions of device // functions. Temporarily redefine __forceinline__ to include @@ -151,7 +176,7 @@ // slow divides), so we need to scope our define carefully here. #pragma push_macro("__USE_FAST_MATH__") #if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__) -#define __USE_FAST_MATH__ +#define __USE_FAST_MATH__ 1 #endif #include "math_functions.hpp" #pragma pop_macro("__USE_FAST_MATH__") @@ -267,8 +292,8 @@ __device__ static inline void *malloc(size_t __size) { } } // namespace std -// Out-of-line implementations from cuda_builtin_vars.h. These need to come -// after we've pulled in the definition of uint3 and dim3. +// Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to +// come after we've pulled in the definition of uint3 and dim3. __device__ inline __cuda_builtin_threadIdx_t::operator uint3() const { uint3 ret; @@ -296,13 +321,14 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const { #include <__clang_cuda_cmath.h> #include <__clang_cuda_intrinsics.h> +#include <__clang_cuda_complex_builtins.h> // curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host // mode, giving them their "proper" types of dim3 and uint3. This is -// incompatible with the types we give in cuda_builtin_vars.h. As as hack, -// force-include the header (nvcc doesn't include it by default) but redefine -// dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are only -// used here for the redeclarations of blockDim and threadIdx.) +// incompatible with the types we give in __clang_cuda_builtin_vars.h. As as +// hack, force-include the header (nvcc doesn't include it by default) but +// redefine dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are +// only used here for the redeclarations of blockDim and threadIdx.) 
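The same push_macro/pop_macro scoping idiom recurs in this wrapper, both for `__USE_FAST_MATH__` above and for `dim3`/`uint3` just below; a reduced sketch of how it behaves (the include is illustrative):

```c++
// Sketch of the #pragma push_macro / pop_macro idiom: a macro's prior state is
// saved, overridden for the duration of one #include, and then restored, so
// the override never leaks into later headers or user code.
#pragma push_macro("dim3")
#define dim3 __cuda_builtin_blockDim_t   // headers included here see the alias
#include "curand_mtgp32_kernel.h"        // illustrative include
#pragma pop_macro("dim3")                // dim3 reverts to its previous meaning
```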
#pragma push_macro("dim3") #pragma push_macro("uint3") #define dim3 __cuda_builtin_blockDim_t diff --git a/lib/Headers/__wmmintrin_aes.h b/lib/Headers/__wmmintrin_aes.h index 211518eb2884..3a2ee1b2ef2e 100644 --- a/lib/Headers/__wmmintrin_aes.h +++ b/lib/Headers/__wmmintrin_aes.h @@ -35,7 +35,7 @@ /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VAESENC instruction. +/// This intrinsic corresponds to the <c> VAESENC </c> instruction. /// /// \param __V /// A 128-bit integer vector containing the state value. @@ -55,7 +55,7 @@ _mm_aesenc_si128(__m128i __V, __m128i __R) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VAESENCLAST instruction. +/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction. /// /// \param __V /// A 128-bit integer vector containing the state value. @@ -75,7 +75,7 @@ _mm_aesenclast_si128(__m128i __V, __m128i __R) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VAESDEC instruction. +/// This intrinsic corresponds to the <c> VAESDEC </c> instruction. /// /// \param __V /// A 128-bit integer vector containing the state value. @@ -95,7 +95,7 @@ _mm_aesdec_si128(__m128i __V, __m128i __R) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VAESDECLAST instruction. +/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction. /// /// \param __V /// A 128-bit integer vector containing the state value. @@ -114,7 +114,7 @@ _mm_aesdeclast_si128(__m128i __V, __m128i __R) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VAESIMC instruction. +/// This intrinsic corresponds to the <c> VAESIMC </c> instruction. /// /// \param __V /// A 128-bit integer vector containing the expanded key. @@ -136,7 +136,7 @@ _mm_aesimc_si128(__m128i __V) /// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R); /// \endcode /// -/// This intrinsic corresponds to the \c AESKEYGENASSIST instruction. +/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction. /// /// \param C /// A 128-bit integer vector that is used to generate the AES encryption key. diff --git a/lib/Headers/__wmmintrin_pclmul.h b/lib/Headers/__wmmintrin_pclmul.h index d4e073f40688..e9c6a9f6d415 100644 --- a/lib/Headers/__wmmintrin_pclmul.h +++ b/lib/Headers/__wmmintrin_pclmul.h @@ -34,7 +34,7 @@ /// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I); /// \endcode /// -/// This intrinsic corresponds to the \c VPCLMULQDQ instruction. +/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction. /// /// \param __X /// A 128-bit vector of [2 x i64] containing one of the source operands. @@ -42,13 +42,12 @@ /// A 128-bit vector of [2 x i64] containing one of the source operands. /// \param __I /// An immediate value specifying which 64-bit values to select from the -/// operands. -/// Bit 0 is used to select a value from operand __X, -/// and bit 4 is used to select a value from operand __Y: -/// Bit[0]=0 indicates that bits[63:0] of operand __X are used. -/// Bit[0]=1 indicates that bits[127:64] of operand __X are used. -/// Bit[4]=0 indicates that bits[63:0] of operand __Y are used. -/// Bit[4]=1 indicates that bits[127:64] of operand __Y are used. +/// operands. Bit 0 is used to select a value from operand \a __X, and bit +/// 4 is used to select a value from operand \a __Y: \n +/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. 
\n +/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n +/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n +/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used. /// \returns The 128-bit integer vector containing the result of the carry-less /// multiplication of the selected 64-bit values. #define _mm_clmulepi64_si128(__X, __Y, __I) \ diff --git a/lib/Headers/altivec.h b/lib/Headers/altivec.h index 74a1914ce83b..d1d1d8026325 100644 --- a/lib/Headers/altivec.h +++ b/lib/Headers/altivec.h @@ -34,8 +34,31 @@ #define __CR6_LT 2 #define __CR6_LT_REV 3 +/* Constants for vec_test_data_class */ +#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0) +#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1) +#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | \ + __VEC_CLASS_FP_SUBNORMAL_N) +#define __VEC_CLASS_FP_ZERO_N (1<<2) +#define __VEC_CLASS_FP_ZERO_P (1<<3) +#define __VEC_CLASS_FP_ZERO (__VEC_CLASS_FP_ZERO_P | \ + __VEC_CLASS_FP_ZERO_N) +#define __VEC_CLASS_FP_INFINITY_N (1<<4) +#define __VEC_CLASS_FP_INFINITY_P (1<<5) +#define __VEC_CLASS_FP_INFINITY (__VEC_CLASS_FP_INFINITY_P | \ + __VEC_CLASS_FP_INFINITY_N) +#define __VEC_CLASS_FP_NAN (1<<6) +#define __VEC_CLASS_FP_NOT_NORMAL (__VEC_CLASS_FP_NAN | \ + __VEC_CLASS_FP_SUBNORMAL | \ + __VEC_CLASS_FP_ZERO | \ + __VEC_CLASS_FP_INFINITY) + #define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__)) +#ifdef __POWER9_VECTOR__ +#include <stddef.h> +#endif + static __inline__ vector signed char __ATTRS_o_ai vec_perm( vector signed char __a, vector signed char __b, vector unsigned char __c); @@ -134,7 +157,7 @@ static __inline__ vector float __ATTRS_o_ai vec_abs(vector float __a) { #endif } -#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +#ifdef __VSX__ static __inline__ vector double __ATTRS_o_ai vec_abs(vector double __a) { return __builtin_vsx_xvabsdp(__a); } @@ -163,6 +186,26 @@ vec_abss(vector signed int __a) { __a, __builtin_altivec_vsubsws((vector signed int)(0), __a)); } +/* vec_absd */ +#if defined(__POWER9_VECTOR__) + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_absd(vector unsigned char __a, vector unsigned char __b) { + return __builtin_altivec_vabsdub(__a, __b); +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_absd(vector unsigned short __a, vector unsigned short __b) { + return __builtin_altivec_vabsduh(__a, __b); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_absd(vector unsigned int __a, vector unsigned int __b) { + return __builtin_altivec_vabsduw(__a, __b); +} + +#endif /* End __POWER9_VECTOR__ */ + /* vec_add */ static __inline__ vector signed char __ATTRS_o_ai @@ -305,6 +348,22 @@ vec_adde(vector unsigned __int128 __a, vector unsigned __int128 __b, } #endif +static __inline__ vector signed int __ATTRS_o_ai +vec_adde(vector signed int __a, vector signed int __b, + vector signed int __c) { + vector signed int __mask = {1, 1, 1, 1}; + vector signed int __carry = __c & __mask; + return vec_add(vec_add(__a, __b), __carry); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_adde(vector unsigned int __a, vector unsigned int __b, + vector unsigned int __c) { + vector unsigned int __mask = {1, 1, 1, 1}; + vector unsigned int __carry = __c & __mask; + return vec_add(vec_add(__a, __b), __carry); +} + /* vec_addec */ #if defined(__POWER8_VECTOR__) && defined(__powerpc64__) @@ -319,6 +378,50 @@ vec_addec(vector unsigned __int128 __a, vector unsigned __int128 __b, vector unsigned __int128 __c) { return 
__builtin_altivec_vaddecuq(__a, __b, __c); } + +static __inline__ vector signed int __ATTRS_o_ai +vec_addec(vector signed int __a, vector signed int __b, + vector signed int __c) { + + signed int __result[4]; + for (int i = 0; i < 4; i++) { + unsigned int __tempa = (unsigned int) __a[i]; + unsigned int __tempb = (unsigned int) __b[i]; + unsigned int __tempc = (unsigned int) __c[i]; + __tempc = __tempc & 0x00000001; + unsigned long long __longa = (unsigned long long) __tempa; + unsigned long long __longb = (unsigned long long) __tempb; + unsigned long long __longc = (unsigned long long) __tempc; + unsigned long long __sum = __longa + __longb + __longc; + unsigned long long __res = (__sum >> 32) & 0x01; + unsigned long long __tempres = (unsigned int) __res; + __result[i] = (signed int) __tempres; + } + + vector signed int ret = { __result[0], __result[1], __result[2], __result[3] }; + return ret; +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_addec(vector unsigned int __a, vector unsigned int __b, + vector unsigned int __c) { + + unsigned int __result[4]; + for (int i = 0; i < 4; i++) { + unsigned int __tempc = __c[i] & 1; + unsigned long long __longa = (unsigned long long) __a[i]; + unsigned long long __longb = (unsigned long long) __b[i]; + unsigned long long __longc = (unsigned long long) __tempc; + unsigned long long __sum = __longa + __longb + __longc; + unsigned long long __res = (__sum >> 32) & 0x01; + unsigned long long __tempres = (unsigned int) __res; + __result[i] = (signed int) __tempres; + } + + vector unsigned int ret = { __result[0], __result[1], __result[2], __result[3] }; + return ret; +} + #endif /* vec_vaddubm */ @@ -1544,6 +1647,12 @@ vec_cmpeq(vector unsigned char __a, vector unsigned char __b) { (vector char)__b); } +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpeq(vector bool char __a, vector bool char __b) { + return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a, + (vector char)__b); +} + static __inline__ vector bool short __ATTRS_o_ai vec_cmpeq(vector short __a, vector short __b) { return (vector bool short)__builtin_altivec_vcmpequh(__a, __b); @@ -1555,6 +1664,12 @@ vec_cmpeq(vector unsigned short __a, vector unsigned short __b) { (vector short)__b); } +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpeq(vector bool short __a, vector bool short __b) { + return (vector bool short)__builtin_altivec_vcmpequh((vector short)__a, + (vector short)__b); +} + static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector int __a, vector int __b) { return (vector bool int)__builtin_altivec_vcmpequw(__a, __b); @@ -1566,6 +1681,12 @@ vec_cmpeq(vector unsigned int __a, vector unsigned int __b) { (vector int)__b); } +static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector bool int __a, + vector bool int __b) { + return (vector bool int)__builtin_altivec_vcmpequw((vector int)__a, + (vector int)__b); +} + #ifdef __POWER8_VECTOR__ static __inline__ vector bool long long __ATTRS_o_ai vec_cmpeq(vector signed long long __a, vector signed long long __b) { @@ -1577,6 +1698,13 @@ vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) { return (vector bool long long)__builtin_altivec_vcmpequd( (vector long long)__a, (vector long long)__b); } + +static __inline__ vector bool long long __ATTRS_o_ai +vec_cmpeq(vector bool long long __a, vector bool long long __b) { + return (vector bool long long)__builtin_altivec_vcmpequd( + (vector long long)__a, (vector long long)__b); +} + #endif static __inline__ vector bool int 
__ATTRS_o_ai vec_cmpeq(vector float __a, @@ -1595,6 +1723,199 @@ vec_cmpeq(vector double __a, vector double __b) { } #endif +#ifdef __POWER9_VECTOR__ +/* vec_cmpne */ + +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpne(vector bool char __a, vector bool char __b) { + return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a, + (vector char)__b); +} + +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpne(vector signed char __a, vector signed char __b) { + return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a, + (vector char)__b); +} + +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpne(vector unsigned char __a, vector unsigned char __b) { + return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a, + (vector char)__b); +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpne(vector bool short __a, vector bool short __b) { + return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a, + (vector short)__b); +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpne(vector signed short __a, vector signed short __b) { + return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a, + (vector short)__b); +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpne(vector unsigned short __a, vector unsigned short __b) { + return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a, + (vector short)__b); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector bool int __a, vector bool int __b) { + return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a, + (vector int)__b); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector signed int __a, vector signed int __b) { + return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a, + (vector int)__b); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector unsigned int __a, vector unsigned int __b) { + return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a, + (vector int)__b); +} + +static __inline__ vector bool long long __ATTRS_o_ai +vec_cmpne(vector bool long long __a, vector bool long long __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} + +static __inline__ vector bool long long __ATTRS_o_ai +vec_cmpne(vector signed long long __a, vector signed long long __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} + +static __inline__ vector bool long long __ATTRS_o_ai +vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector float __a, vector float __b) { + return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a, + (vector int)__b); +} + +static __inline__ vector bool long long __ATTRS_o_ai +vec_cmpne(vector double __a, vector double __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} + +/* vec_cmpnez */ + +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpnez(vector signed char __a, vector signed char __b) { + return (vector bool char)__builtin_altivec_vcmpnezb((vector char)__a, + (vector char)__b); +} + +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpnez(vector unsigned char __a, vector unsigned char __b) { + return (vector bool 
char)__builtin_altivec_vcmpnezb((vector char)__a, + (vector char)__b); +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpnez(vector signed short __a, vector signed short __b) { + return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a, + (vector short)__b); +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpnez(vector unsigned short __a, vector unsigned short __b) { + return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a, + (vector short)__b); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpnez(vector signed int __a, vector signed int __b) { + return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a, + (vector int)__b); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpnez(vector unsigned int __a, vector unsigned int __b) { + return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a, + (vector int)__b); +} + +static __inline__ signed int __ATTRS_o_ai +vec_cntlz_lsbb(vector signed char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vctzlsbb(__a); +#else + return __builtin_altivec_vclzlsbb(__a); +#endif +} + +static __inline__ signed int __ATTRS_o_ai +vec_cntlz_lsbb(vector unsigned char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vctzlsbb(__a); +#else + return __builtin_altivec_vclzlsbb(__a); +#endif +} + +static __inline__ signed int __ATTRS_o_ai +vec_cnttz_lsbb(vector signed char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vclzlsbb(__a); +#else + return __builtin_altivec_vctzlsbb(__a); +#endif +} + +static __inline__ signed int __ATTRS_o_ai +vec_cnttz_lsbb(vector unsigned char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vclzlsbb(__a); +#else + return __builtin_altivec_vctzlsbb(__a); +#endif +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_parity_lsbb(vector unsigned int __a) { + return __builtin_altivec_vprtybw(__a); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_parity_lsbb(vector signed int __a) { + return __builtin_altivec_vprtybw(__a); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_parity_lsbb(vector unsigned __int128 __a) { + return __builtin_altivec_vprtybq(__a); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_parity_lsbb(vector signed __int128 __a) { + return __builtin_altivec_vprtybq(__a); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_parity_lsbb(vector unsigned long long __a) { + return __builtin_altivec_vprtybd(__a); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_parity_lsbb(vector signed long long __a) { + return __builtin_altivec_vprtybd(__a); +} + +#endif + /* vec_cmpgt */ static __inline__ vector bool char __ATTRS_o_ai @@ -1882,6 +2203,41 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) { return vec_cmpgt(__b, __a); } +/* vec_popcnt */ + +static __inline__ vector signed char __ATTRS_o_ai +vec_popcnt(vector signed char __a) { + return __builtin_altivec_vpopcntb(__a); +} +static __inline__ vector unsigned char __ATTRS_o_ai +vec_popcnt(vector unsigned char __a) { + return __builtin_altivec_vpopcntb(__a); +} +static __inline__ vector signed short __ATTRS_o_ai +vec_popcnt(vector signed short __a) { + return __builtin_altivec_vpopcnth(__a); +} +static __inline__ vector unsigned short __ATTRS_o_ai +vec_popcnt(vector unsigned short __a) { + return __builtin_altivec_vpopcnth(__a); +} +static __inline__ vector signed int __ATTRS_o_ai +vec_popcnt(vector signed int __a) { + return 
__builtin_altivec_vpopcntw(__a); +} +static __inline__ vector unsigned int __ATTRS_o_ai +vec_popcnt(vector unsigned int __a) { + return __builtin_altivec_vpopcntw(__a); +} +static __inline__ vector signed long long __ATTRS_o_ai +vec_popcnt(vector signed long long __a) { + return __builtin_altivec_vpopcntd(__a); +} +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_popcnt(vector unsigned long long __a) { + return __builtin_altivec_vpopcntd(__a); +} + /* vec_cntlz */ static __inline__ vector signed char __ATTRS_o_ai @@ -1918,6 +2274,603 @@ vec_cntlz(vector unsigned long long __a) { } #endif +#ifdef __POWER9_VECTOR__ + +/* vec_cnttz */ + +static __inline__ vector signed char __ATTRS_o_ai +vec_cnttz(vector signed char __a) { + return __builtin_altivec_vctzb(__a); +} +static __inline__ vector unsigned char __ATTRS_o_ai +vec_cnttz(vector unsigned char __a) { + return __builtin_altivec_vctzb(__a); +} +static __inline__ vector signed short __ATTRS_o_ai +vec_cnttz(vector signed short __a) { + return __builtin_altivec_vctzh(__a); +} +static __inline__ vector unsigned short __ATTRS_o_ai +vec_cnttz(vector unsigned short __a) { + return __builtin_altivec_vctzh(__a); +} +static __inline__ vector signed int __ATTRS_o_ai +vec_cnttz(vector signed int __a) { + return __builtin_altivec_vctzw(__a); +} +static __inline__ vector unsigned int __ATTRS_o_ai +vec_cnttz(vector unsigned int __a) { + return __builtin_altivec_vctzw(__a); +} +static __inline__ vector signed long long __ATTRS_o_ai +vec_cnttz(vector signed long long __a) { + return __builtin_altivec_vctzd(__a); +} +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_cnttz(vector unsigned long long __a) { + return __builtin_altivec_vctzd(__a); +} + +/* vec_first_match_index */ + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_index(vector signed char __a, vector signed char __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 3; + } + return __res[0] >> 3; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_index(vector unsigned char __a, vector unsigned char __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 3; + } + return __res[0] >> 3; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_index(vector signed short __a, vector signed short __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 4; + } + return __res[0] >> 4; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_index(vector unsigned short __a, vector unsigned short __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 4; + } + return __res[0] >> 4; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_index(vector signed int __a, vector signed int __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ 
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 5; + } + return __res[0] >> 5; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_index(vector unsigned int __a, vector unsigned int __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 5; + } + return __res[0] >> 5; +} + +/* vec_first_match_or_eos_index */ + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_or_eos_index(vector signed char __a, vector signed char __b) { + /* Compare the result of the comparison of two vectors with either and OR the + result. Either the elements are equal or one will equal the comparison + result if either is zero. + */ + vector bool char __tmp1 = vec_cmpeq(__a, __b); + vector bool char __tmp2 = __tmp1 | + vec_cmpeq((vector signed char)__tmp1, __a) | + vec_cmpeq((vector signed char)__tmp1, __b); + + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)__tmp2); +#else + vec_cntlz((vector unsigned long long)__tmp2); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 3; + } + return __res[0] >> 3; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_or_eos_index(vector unsigned char __a, + vector unsigned char __b) { + vector bool char __tmp1 = vec_cmpeq(__a, __b); + vector bool char __tmp2 = __tmp1 | + vec_cmpeq((vector unsigned char)__tmp1, __a) | + vec_cmpeq((vector unsigned char)__tmp1, __b); + + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)__tmp2); +#else + vec_cntlz((vector unsigned long long)__tmp2); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 3; + } + return __res[0] >> 3; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_or_eos_index(vector signed short __a, vector signed short __b) { + vector bool short __tmp1 = vec_cmpeq(__a, __b); + vector bool short __tmp2 = __tmp1 | + vec_cmpeq((vector signed short)__tmp1, __a) | + vec_cmpeq((vector signed short)__tmp1, __b); + + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)__tmp2); +#else + vec_cntlz((vector unsigned long long)__tmp2); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 4; + } + return __res[0] >> 4; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_or_eos_index(vector unsigned short __a, + vector unsigned short __b) { + vector bool short __tmp1 = vec_cmpeq(__a, __b); + vector bool short __tmp2 = __tmp1 | + vec_cmpeq((vector unsigned short)__tmp1, __a) | + vec_cmpeq((vector unsigned short)__tmp1, __b); + + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)__tmp2); +#else + vec_cntlz((vector unsigned long long)__tmp2); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 4; + } + return __res[0] >> 4; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_or_eos_index(vector signed int __a, vector signed int __b) { + vector bool int __tmp1 = vec_cmpeq(__a, __b); + vector bool int __tmp2 = __tmp1 | vec_cmpeq((vector signed int)__tmp1, __a) | + vec_cmpeq((vector signed int)__tmp1, __b); + + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)__tmp2); +#else + 
vec_cntlz((vector unsigned long long)__tmp2); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 5; + } + return __res[0] >> 5; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_match_or_eos_index(vector unsigned int __a, vector unsigned int __b) { + vector bool int __tmp1 = vec_cmpeq(__a, __b); + vector bool int __tmp2 = __tmp1 | + vec_cmpeq((vector unsigned int)__tmp1, __a) | + vec_cmpeq((vector unsigned int)__tmp1, __b); + + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)__tmp2); +#else + vec_cntlz((vector unsigned long long)__tmp2); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 5; + } + return __res[0] >> 5; +} + +/* vec_first_mismatch_index */ + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_index(vector signed char __a, vector signed char __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 3; + } + return __res[0] >> 3; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_index(vector unsigned char __a, vector unsigned char __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 3; + } + return __res[0] >> 3; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_index(vector signed short __a, vector signed short __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 4; + } + return __res[0] >> 4; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_index(vector unsigned short __a, vector unsigned short __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 4; + } + return __res[0] >> 4; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_index(vector signed int __a, vector signed int __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 5; + } + return __res[0] >> 5; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_index(vector unsigned int __a, vector unsigned int __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 5; + } + return __res[0] >> 5; +} + +/* vec_first_mismatch_or_eos_index */ + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_or_eos_index(vector signed char __a, + vector signed char __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b)); +#endif + if 
(__res[0] == 64) { + return (__res[1] + 64) >> 3; + } + return __res[0] >> 3; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_or_eos_index(vector unsigned char __a, + vector unsigned char __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 3; + } + return __res[0] >> 3; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_or_eos_index(vector signed short __a, + vector signed short __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 4; + } + return __res[0] >> 4; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_or_eos_index(vector unsigned short __a, + vector unsigned short __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 4; + } + return __res[0] >> 4; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_or_eos_index(vector signed int __a, vector signed int __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 5; + } + return __res[0] >> 5; +} + +static __inline__ unsigned __ATTRS_o_ai +vec_first_mismatch_or_eos_index(vector unsigned int __a, + vector unsigned int __b) { + vector unsigned long long __res = +#ifdef __LITTLE_ENDIAN__ + vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b)); +#else + vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b)); +#endif + if (__res[0] == 64) { + return (__res[1] + 64) >> 5; + } + return __res[0] >> 5; +} + +static __inline__ vector double __ATTRS_o_ai +vec_insert_exp(vector double __a, vector unsigned long long __b) { + return __builtin_vsx_xviexpdp((vector unsigned long long)__a,__b); +} + +static __inline__ vector double __ATTRS_o_ai +vec_insert_exp(vector unsigned long long __a, vector unsigned long long __b) { + return __builtin_vsx_xviexpdp(__a,__b); +} + +static __inline__ vector float __ATTRS_o_ai +vec_insert_exp(vector float __a, vector unsigned int __b) { + return __builtin_vsx_xviexpsp((vector unsigned int)__a,__b); +} + +static __inline__ vector float __ATTRS_o_ai +vec_insert_exp(vector unsigned int __a, vector unsigned int __b) { + return __builtin_vsx_xviexpsp(__a,__b); +} + +#if defined(__powerpc64__) +static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(signed char *__a, + size_t __b) { + return (vector signed char)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_xl_len(unsigned char *__a, size_t __b) { + return (vector unsigned char)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(signed short *__a, + size_t __b) { + return (vector signed short)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_xl_len(unsigned short *__a, size_t __b) { + return (vector unsigned 
short)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(signed int *__a, + size_t __b) { + return (vector signed int)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(unsigned int *__a, + size_t __b) { + return (vector unsigned int)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector float __ATTRS_o_ai vec_xl_len(float *__a, size_t __b) { + return (vector float)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_xl_len(signed __int128 *__a, size_t __b) { + return (vector signed __int128)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_len(unsigned __int128 *__a, size_t __b) { + return (vector unsigned __int128)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_xl_len(signed long long *__a, size_t __b) { + return (vector signed long long)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_xl_len(unsigned long long *__a, size_t __b) { + return (vector unsigned long long)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a, + size_t __b) { + return (vector double)__builtin_vsx_lxvl(__a, (__b << 56)); +} + +static __inline__ vector double __ATTRS_o_ai vec_xl_len_r(unsigned char *__a, + size_t __b) { + vector unsigned char __res = + (vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56)); +#ifdef __LITTLE_ENDIAN__ + vector unsigned char __mask = + (vector unsigned char)__builtin_altivec_lvsr(16 - __b, (int *)NULL); + __res = (vector unsigned char)__builtin_altivec_vperm_4si( + (vector int)__res, (vector int)__res, __mask); +#endif + return __res; +} + +// vec_xst_len +static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned char __a, + unsigned char *__b, + size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed char __a, + signed char *__b, size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed short __a, + signed short *__b, size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned short __a, + unsigned short *__b, + size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed int __a, + signed int *__b, size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned int __a, + unsigned int *__b, size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai vec_xst_len(vector float __a, float *__b, + size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed __int128 __a, + signed __int128 *__b, + size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned __int128 __a, + unsigned __int128 *__b, + size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai 
vec_xst_len(vector signed long long __a, + signed long long *__b, + size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned long long __a, + unsigned long long *__b, + size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai vec_xst_len(vector double __a, double *__b, + size_t __c) { + return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56)); +} + +static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a, + unsigned char *__b, + size_t __c) { +#ifdef __LITTLE_ENDIAN__ + vector unsigned char __mask = + (vector unsigned char)__builtin_altivec_lvsl(16 - __c, (int *)NULL); + vector unsigned char __res = + __builtin_altivec_vperm_4si((vector int)__a, (vector int)__a, __mask); + return __builtin_vsx_stxvll((vector int)__res, __b, (__c << 56)); +#else + return __builtin_vsx_stxvll((vector int)__a, __b, (__c << 56)); +#endif +} +#endif +#endif + /* vec_cpsgn */ #ifdef __VSX__ @@ -2016,20 +2969,284 @@ vec_vctuxs(vector float __a, int __b) { return __builtin_altivec_vctuxs(__a, __b); } +/* vec_signed */ + +static __inline__ vector signed int __ATTRS_o_ai +vec_sld(vector signed int, vector signed int, unsigned const int __c); + +static __inline__ vector signed int __ATTRS_o_ai +vec_signed(vector float __a) { + return __builtin_convertvector(__a, vector signed int); +} + +#ifdef __VSX__ +static __inline__ vector signed long long __ATTRS_o_ai +vec_signed(vector double __a) { + return __builtin_convertvector(__a, vector signed long long); +} + +static __inline__ vector signed int __attribute__((__always_inline__)) +vec_signed2(vector double __a, vector double __b) { + return (vector signed int) { __a[0], __a[1], __b[0], __b[1] }; +} + +static __inline__ vector signed int __ATTRS_o_ai +vec_signede(vector double __a) { +#ifdef __LITTLE_ENDIAN__ + vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a); + return vec_sld(__ret, __ret, 12); +#else + return __builtin_vsx_xvcvdpsxws(__a); +#endif +} + +static __inline__ vector signed int __ATTRS_o_ai +vec_signedo(vector double __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_vsx_xvcvdpsxws(__a); +#else + vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a); + return vec_sld(__ret, __ret, 12); +#endif +} +#endif + +/* vec_unsigned */ + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_sld(vector unsigned int, vector unsigned int, unsigned const int __c); + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_unsigned(vector float __a) { + return __builtin_convertvector(__a, vector unsigned int); +} + +#ifdef __VSX__ +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_unsigned(vector double __a) { + return __builtin_convertvector(__a, vector unsigned long long); +} + +static __inline__ vector unsigned int __attribute__((__always_inline__)) +vec_unsigned2(vector double __a, vector double __b) { + return (vector unsigned int) { __a[0], __a[1], __b[0], __b[1] }; +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_unsignede(vector double __a) { +#ifdef __LITTLE_ENDIAN__ + vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a); + return vec_sld(__ret, __ret, 12); +#else + return __builtin_vsx_xvcvdpuxws(__a); +#endif +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_unsignedo(vector double __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_vsx_xvcvdpuxws(__a); +#else + vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a); + return 
vec_sld(__ret, __ret, 12); +#endif +} +#endif + +/* vec_float */ + +static __inline__ vector float __ATTRS_o_ai +vec_sld(vector float, vector float, unsigned const int __c); + +static __inline__ vector float __ATTRS_o_ai +vec_float(vector signed int __a) { + return __builtin_convertvector(__a, vector float); +} + +static __inline__ vector float __ATTRS_o_ai +vec_float(vector unsigned int __a) { + return __builtin_convertvector(__a, vector float); +} + +#ifdef __VSX__ +static __inline__ vector float __ATTRS_o_ai +vec_float2(vector signed long long __a, vector signed long long __b) { + return (vector float) { __a[0], __a[1], __b[0], __b[1] }; +} + +static __inline__ vector float __ATTRS_o_ai +vec_float2(vector unsigned long long __a, vector unsigned long long __b) { + return (vector float) { __a[0], __a[1], __b[0], __b[1] }; +} + +static __inline__ vector float __ATTRS_o_ai +vec_float2(vector double __a, vector double __b) { + return (vector float) { __a[0], __a[1], __b[0], __b[1] }; +} + +static __inline__ vector float __ATTRS_o_ai +vec_floate(vector signed long long __a) { +#ifdef __LITTLE_ENDIAN__ + vector float __ret = __builtin_vsx_xvcvsxdsp(__a); + return vec_sld(__ret, __ret, 12); +#else + return __builtin_vsx_xvcvsxdsp(__a); +#endif +} + +static __inline__ vector float __ATTRS_o_ai +vec_floate(vector unsigned long long __a) { +#ifdef __LITTLE_ENDIAN__ + vector float __ret = __builtin_vsx_xvcvuxdsp(__a); + return vec_sld(__ret, __ret, 12); +#else + return __builtin_vsx_xvcvuxdsp(__a); +#endif +} + +static __inline__ vector float __ATTRS_o_ai +vec_floate(vector double __a) { +#ifdef __LITTLE_ENDIAN__ + vector float __ret = __builtin_vsx_xvcvdpsp(__a); + return vec_sld(__ret, __ret, 12); +#else + return __builtin_vsx_xvcvdpsp(__a); +#endif +} + +static __inline__ vector float __ATTRS_o_ai +vec_floato(vector signed long long __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_vsx_xvcvsxdsp(__a); +#else + vector float __ret = __builtin_vsx_xvcvsxdsp(__a); + return vec_sld(__ret, __ret, 12); +#endif +} + +static __inline__ vector float __ATTRS_o_ai +vec_floato(vector unsigned long long __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_vsx_xvcvuxdsp(__a); +#else + vector float __ret = __builtin_vsx_xvcvuxdsp(__a); + return vec_sld(__ret, __ret, 12); +#endif +} + +static __inline__ vector float __ATTRS_o_ai +vec_floato(vector double __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_vsx_xvcvdpsp(__a); +#else + vector float __ret = __builtin_vsx_xvcvdpsp(__a); + return vec_sld(__ret, __ret, 12); +#endif +} +#endif + /* vec_double */ #ifdef __VSX__ static __inline__ vector double __ATTRS_o_ai vec_double(vector signed long long __a) { + return __builtin_convertvector(__a, vector double); +} + +static __inline__ vector double __ATTRS_o_ai +vec_double(vector unsigned long long __a) { + return __builtin_convertvector(__a, vector double); +} + +static __inline__ vector double __ATTRS_o_ai +vec_doublee(vector signed int __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4)); +#else + return __builtin_vsx_xvcvsxwdp(__a); +#endif +} + +static __inline__ vector double __ATTRS_o_ai +vec_doublee(vector unsigned int __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4)); +#else + return __builtin_vsx_xvcvuxwdp(__a); +#endif +} + +static __inline__ vector double __ATTRS_o_ai +vec_doublee(vector float __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4)); +#else + return __builtin_vsx_xvcvspdp(__a); +#endif 
+} + +static __inline__ vector double __ATTRS_o_ai +vec_doubleh(vector signed int __a) { vector double __ret = {__a[0], __a[1]}; return __ret; } static __inline__ vector double __ATTRS_o_ai -vec_double(vector unsigned long long __a) { +vec_doubleh(vector unsigned int __a) { vector double __ret = {__a[0], __a[1]}; return __ret; } + +static __inline__ vector double __ATTRS_o_ai +vec_doubleh(vector float __a) { + vector double __ret = {__a[0], __a[1]}; + return __ret; +} + +static __inline__ vector double __ATTRS_o_ai +vec_doublel(vector signed int __a) { + vector double __ret = {__a[2], __a[3]}; + return __ret; +} + +static __inline__ vector double __ATTRS_o_ai +vec_doublel(vector unsigned int __a) { + vector double __ret = {__a[2], __a[3]}; + return __ret; +} + +static __inline__ vector double __ATTRS_o_ai +vec_doublel(vector float __a) { + vector double __ret = {__a[2], __a[3]}; + return __ret; +} + +static __inline__ vector double __ATTRS_o_ai +vec_doubleo(vector signed int __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_vsx_xvcvsxwdp(__a); +#else + return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4)); +#endif +} + +static __inline__ vector double __ATTRS_o_ai +vec_doubleo(vector unsigned int __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_vsx_xvcvuxwdp(__a); +#else + return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4)); +#endif +} + +static __inline__ vector double __ATTRS_o_ai +vec_doubleo(vector float __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_vsx_xvcvspdp(__a); +#else + return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4)); +#endif +} #endif /* vec_div */ @@ -3835,6 +5052,34 @@ vec_mergee(vector unsigned int __a, vector unsigned int __b) { 0x18, 0x19, 0x1A, 0x1B)); } +static __inline__ vector bool long long __ATTRS_o_ai +vec_mergee(vector bool long long __a, vector bool long long __b) { + return vec_mergeh(__a, __b); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_mergee(vector signed long long __a, vector signed long long __b) { + return vec_mergeh(__a, __b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_mergee(vector unsigned long long __a, vector unsigned long long __b) { + return vec_mergeh(__a, __b); +} + +static __inline__ vector float __ATTRS_o_ai +vec_mergee(vector float __a, vector float __b) { + return vec_perm(__a, __b, + (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11, + 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, + 0x18, 0x19, 0x1A, 0x1B)); +} + +static __inline__ vector double __ATTRS_o_ai +vec_mergee(vector double __a, vector double __b) { + return vec_mergeh(__a, __b); +} + /* vec_mergeo */ static __inline__ vector bool int __ATTRS_o_ai vec_mergeo(vector bool int __a, @@ -3861,6 +5106,34 @@ vec_mergeo(vector unsigned int __a, vector unsigned int __b) { 0x1C, 0x1D, 0x1E, 0x1F)); } +static __inline__ vector bool long long __ATTRS_o_ai +vec_mergeo(vector bool long long __a, vector bool long long __b) { + return vec_mergel(__a, __b); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_mergeo(vector signed long long __a, vector signed long long __b) { + return vec_mergel(__a, __b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_mergeo(vector unsigned long long __a, vector unsigned long long __b) { + return vec_mergel(__a, __b); +} + +static __inline__ vector float __ATTRS_o_ai +vec_mergeo(vector float __a, vector float __b) { + return vec_perm(__a, __b, + (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15, + 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, + 0x1C, 0x1D, 0x1E, 0x1F)); +} + 
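As an illustration only (editorial, not part of the commit or of altivec.h), here is a minimal sketch of how the even/odd merge and float-to-double widening helpers added in the hunks above behave. It assumes a POWER8-class VSX target (e.g. clang -mcpu=power8 -maltivec) and this revision of the header; the variable names are invented for the example.

#include <altivec.h>
#include <stdio.h>

int main(void) {
  vector float a = {1.0f, 2.0f, 3.0f, 4.0f};
  vector float b = {5.0f, 6.0f, 7.0f, 8.0f};

  /* vec_mergee keeps the even-indexed elements of both inputs:
     {a[0], b[0], a[2], b[2]} == {1, 5, 3, 7}. */
  vector float even = vec_mergee(a, b);

  /* vec_mergeo keeps the odd-indexed elements:
     {a[1], b[1], a[3], b[3]} == {2, 6, 4, 8}. */
  vector float odd = vec_mergeo(a, b);

  /* The vec_double* helpers widen elements to double: vec_doubleh takes the
     first two elements, vec_doublel the last two, while vec_doublee and
     vec_doubleo take the even- and odd-indexed ones respectively. */
  vector double hi = vec_doubleh(a);   /* {1.0, 2.0} */
  vector double lo = vec_doublel(a);   /* {3.0, 4.0} */

  printf("%f %f %f %f\n", even[0], even[1], even[2], even[3]);
  printf("%f %f %f %f\n", odd[0], odd[1], odd[2], odd[3]);
  printf("%f %f %f %f\n", hi[0], hi[1], lo[0], lo[1]);
  return 0;
}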
+static __inline__ vector double __ATTRS_o_ai +vec_mergeo(vector double __a, vector double __b) { + return vec_mergel(__a, __b); +} + #endif /* vec_mfvscr */ @@ -4689,6 +5962,12 @@ static __inline__ vector bool int __ATTRS_o_ai vec_nand(vector bool int __a, return ~(__a & __b); } +static __inline__ vector float __ATTRS_o_ai +vec_nand(vector float __a, vector float __b) { + return (vector float)(~((vector unsigned int)__a & + (vector unsigned int)__b)); +} + static __inline__ vector signed long long __ATTRS_o_ai vec_nand(vector signed long long __a, vector signed long long __b) { return ~(__a & __b); @@ -4724,6 +6003,12 @@ vec_nand(vector bool long long __a, vector bool long long __b) { return ~(__a & __b); } +static __inline__ vector double __ATTRS_o_ai +vec_nand(vector double __a, vector double __b) { + return (vector double)(~((vector unsigned long long)__a & + (vector unsigned long long)__b)); +} + #endif /* vec_nmadd */ @@ -5195,6 +6480,16 @@ static __inline__ vector bool int __ATTRS_o_ai vec_orc(vector bool int __a, return __a | ~__b; } +static __inline__ vector float __ATTRS_o_ai +vec_orc(vector bool int __a, vector float __b) { + return (vector float)(__a | ~(vector unsigned int)__b); +} + +static __inline__ vector float __ATTRS_o_ai +vec_orc(vector float __a, vector bool int __b) { + return (vector float)((vector unsigned int)__a | ~__b); +} + static __inline__ vector signed long long __ATTRS_o_ai vec_orc(vector signed long long __a, vector signed long long __b) { return __a | ~__b; @@ -5229,6 +6524,16 @@ static __inline__ vector bool long long __ATTRS_o_ai vec_orc(vector bool long long __a, vector bool long long __b) { return __a | ~__b; } + +static __inline__ vector double __ATTRS_o_ai +vec_orc(vector double __a, vector bool long long __b) { + return (vector double)((vector unsigned long long)__a | ~__b); +} + +static __inline__ vector double __ATTRS_o_ai +vec_orc(vector bool long long __a, vector double __b) { + return (vector double)(__a | ~(vector unsigned long long)__b); +} #endif /* vec_vor */ @@ -5536,8 +6841,25 @@ vec_pack(vector bool long long __a, vector bool long long __b) { #endif } +static __inline__ vector float __ATTRS_o_ai +vec_pack(vector double __a, vector double __b) { + return (vector float) (__a[0], __a[1], __b[0], __b[1]); +} #endif +#ifdef __POWER9_VECTOR__ +static __inline__ vector unsigned short __ATTRS_o_ai +vec_pack_to_short_fp32(vector float __a, vector float __b) { + vector float __resa = __builtin_vsx_xvcvsphp(__a); + vector float __resb = __builtin_vsx_xvcvsphp(__b); +#ifdef __LITTLE_ENDIAN__ + return (vector unsigned short)vec_mergee(__resa, __resb); +#else + return (vector unsigned short)vec_mergeo(__resa, __resb); +#endif +} + +#endif /* vec_vpkuhum */ #define __builtin_altivec_vpkuhum vec_vpkuhum @@ -6324,6 +7646,34 @@ vec_rl(vector unsigned long long __a, vector unsigned long long __b) { } #endif +/* vec_rlmi */ +#ifdef __POWER9_VECTOR__ +static __inline__ vector unsigned int __ATTRS_o_ai +vec_rlmi(vector unsigned int __a, vector unsigned int __b, + vector unsigned int __c) { + return __builtin_altivec_vrlwmi(__a, __c, __b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_rlmi(vector unsigned long long __a, vector unsigned long long __b, + vector unsigned long long __c) { + return __builtin_altivec_vrldmi(__a, __c, __b); +} + +/* vec_rlnm */ +static __inline__ vector unsigned int __ATTRS_o_ai +vec_rlnm(vector unsigned int __a, vector unsigned int __b, + vector unsigned int __c) { + return __builtin_altivec_vrlwnm(__a, __b) & 
__c; +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_rlnm(vector unsigned long long __a, vector unsigned long long __b, + vector unsigned long long __c) { + return __builtin_altivec_vrldnm(__a, __b) & __c; +} +#endif + /* vec_vrlb */ static __inline__ vector signed char __ATTRS_o_ai @@ -6984,6 +8334,145 @@ static __inline__ vector float __ATTRS_o_ai vec_sld(vector float __a, #endif } +#ifdef __VSX__ +static __inline__ vector bool long long __ATTRS_o_ai +vec_sld(vector bool long long __a, vector bool long long __b, + unsigned const int __c) { + unsigned char __d = __c & 0x0F; +#ifdef __LITTLE_ENDIAN__ + return vec_perm( + __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, + 20 - __d, 21 - __d, 22 - __d, 23 - __d, + 24 - __d, 25 - __d, 26 - __d, 27 - __d, + 28 - __d, 29 - __d, 30 - __d, 31 - __d)); +#else + return vec_perm( + __a, __b, + (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5, + __d + 6, __d + 7, __d + 8, __d + 9, __d + 10, + __d + 11, __d + 12, __d + 13, __d + 14, __d + 15)); +#endif +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_sld(vector signed long long __a, vector signed long long __b, + unsigned const int __c) { + unsigned char __d = __c & 0x0F; +#ifdef __LITTLE_ENDIAN__ + return vec_perm( + __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, + 20 - __d, 21 - __d, 22 - __d, 23 - __d, + 24 - __d, 25 - __d, 26 - __d, 27 - __d, + 28 - __d, 29 - __d, 30 - __d, 31 - __d)); +#else + return vec_perm( + __a, __b, + (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5, + __d + 6, __d + 7, __d + 8, __d + 9, __d + 10, + __d + 11, __d + 12, __d + 13, __d + 14, __d + 15)); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_sld(vector unsigned long long __a, vector unsigned long long __b, + unsigned const int __c) { + unsigned char __d = __c & 0x0F; +#ifdef __LITTLE_ENDIAN__ + return vec_perm( + __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, + 20 - __d, 21 - __d, 22 - __d, 23 - __d, + 24 - __d, 25 - __d, 26 - __d, 27 - __d, + 28 - __d, 29 - __d, 30 - __d, 31 - __d)); +#else + return vec_perm( + __a, __b, + (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5, + __d + 6, __d + 7, __d + 8, __d + 9, __d + 10, + __d + 11, __d + 12, __d + 13, __d + 14, __d + 15)); +#endif +} + +static __inline__ vector double __ATTRS_o_ai vec_sld(vector double __a, + vector double __b, + unsigned const int __c) { + unsigned char __d = __c & 0x0F; +#ifdef __LITTLE_ENDIAN__ + return vec_perm( + __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d, + 20 - __d, 21 - __d, 22 - __d, 23 - __d, + 24 - __d, 25 - __d, 26 - __d, 27 - __d, + 28 - __d, 29 - __d, 30 - __d, 31 - __d)); +#else + return vec_perm( + __a, __b, + (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5, + __d + 6, __d + 7, __d + 8, __d + 9, __d + 10, + __d + 11, __d + 12, __d + 13, __d + 14, __d + 15)); +#endif +} +#endif + +/* vec_sldw */ +static __inline__ vector signed char __ATTRS_o_ai vec_sldw( + vector signed char __a, vector signed char __b, unsigned const int __c) { + return vec_sld(__a, __b, ((__c << 2) & 0x0F)); +} + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_sldw(vector unsigned char __a, vector unsigned char __b, + unsigned const int __c) { + return vec_sld(__a, __b, ((__c << 2) & 0x0F)); +} + +static __inline__ vector signed short __ATTRS_o_ai vec_sldw( + vector signed short __a, vector signed 
short __b, unsigned const int __c) { + return vec_sld(__a, __b, ((__c << 2) & 0x0F)); +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_sldw(vector unsigned short __a, vector unsigned short __b, + unsigned const int __c) { + return vec_sld(__a, __b, ((__c << 2) & 0x0F)); +} + +static __inline__ vector signed int __ATTRS_o_ai +vec_sldw(vector signed int __a, vector signed int __b, unsigned const int __c) { + return vec_sld(__a, __b, ((__c << 2) & 0x0F)); +} + +static __inline__ vector unsigned int __ATTRS_o_ai vec_sldw( + vector unsigned int __a, vector unsigned int __b, unsigned const int __c) { + return vec_sld(__a, __b, ((__c << 2) & 0x0F)); +} + +#ifdef __VSX__ +static __inline__ vector signed long long __ATTRS_o_ai +vec_sldw(vector signed long long __a, vector signed long long __b, + unsigned const int __c) { + return vec_sld(__a, __b, ((__c << 2) & 0x0F)); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_sldw(vector unsigned long long __a, vector unsigned long long __b, + unsigned const int __c) { + return vec_sld(__a, __b, ((__c << 2) & 0x0F)); +} +#endif + +#ifdef __POWER9_VECTOR__ +/* vec_slv */ +static __inline__ vector unsigned char __ATTRS_o_ai +vec_slv(vector unsigned char __a, vector unsigned char __b) { + return __builtin_altivec_vslv(__a, __b); +} + +/* vec_srv */ +static __inline__ vector unsigned char __ATTRS_o_ai +vec_srv(vector unsigned char __a, vector unsigned char __b) { + return __builtin_altivec_vsrv(__a, __b); +} +#endif + /* vec_vsldoi */ static __inline__ vector signed char __ATTRS_o_ai @@ -7307,6 +8796,20 @@ vec_sll(vector bool int __a, vector unsigned int __b) { (vector int)__b); } +#ifdef __VSX__ +static __inline__ vector signed long long __ATTRS_o_ai +vec_sll(vector signed long long __a, vector unsigned char __b) { + return (vector signed long long)__builtin_altivec_vsl((vector int)__a, + (vector int)__b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_sll(vector unsigned long long __a, vector unsigned char __b) { + return (vector unsigned long long)__builtin_altivec_vsl((vector int)__a, + (vector int)__b); +} +#endif + /* vec_vsl */ static __inline__ vector signed char __ATTRS_o_ai @@ -7570,6 +9073,32 @@ static __inline__ vector float __ATTRS_o_ai vec_slo(vector float __a, return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b); } +#ifdef __VSX__ +static __inline__ vector signed long long __ATTRS_o_ai +vec_slo(vector signed long long __a, vector signed char __b) { + return (vector signed long long)__builtin_altivec_vslo((vector int)__a, + (vector int)__b); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_slo(vector signed long long __a, vector unsigned char __b) { + return (vector signed long long)__builtin_altivec_vslo((vector int)__a, + (vector int)__b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_slo(vector unsigned long long __a, vector signed char __b) { + return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a, + (vector int)__b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_slo(vector unsigned long long __a, vector unsigned char __b) { + return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a, + (vector int)__b); +} +#endif + /* vec_vslo */ static __inline__ vector signed char __ATTRS_o_ai @@ -8304,6 +9833,20 @@ vec_srl(vector bool int __a, vector unsigned int __b) { (vector int)__b); } +#ifdef __VSX__ +static __inline__ vector signed long long __ATTRS_o_ai +vec_srl(vector signed 
long long __a, vector unsigned char __b) { + return (vector signed long long)__builtin_altivec_vsr((vector int)__a, + (vector int)__b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_srl(vector unsigned long long __a, vector unsigned char __b) { + return (vector unsigned long long)__builtin_altivec_vsr((vector int)__a, + (vector int)__b); +} +#endif + /* vec_vsr */ static __inline__ vector signed char __ATTRS_o_ai @@ -8567,6 +10110,32 @@ static __inline__ vector float __ATTRS_o_ai vec_sro(vector float __a, return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b); } +#ifdef __VSX__ +static __inline__ vector signed long long __ATTRS_o_ai +vec_sro(vector signed long long __a, vector signed char __b) { + return (vector signed long long)__builtin_altivec_vsro((vector int)__a, + (vector int)__b); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_sro(vector signed long long __a, vector unsigned char __b) { + return (vector signed long long)__builtin_altivec_vsro((vector int)__a, + (vector int)__b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_sro(vector unsigned long long __a, vector signed char __b) { + return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a, + (vector int)__b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_sro(vector unsigned long long __a, vector unsigned char __b) { + return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a, + (vector int)__b); +} +#endif + /* vec_vsro */ static __inline__ vector signed char __ATTRS_o_ai @@ -9580,6 +11149,12 @@ vec_vsubfp(vector float __a, vector float __b) { /* vec_subc */ +static __inline__ vector signed int __ATTRS_o_ai +vec_subc(vector signed int __a, vector signed int __b) { + return (vector signed int)__builtin_altivec_vsubcuw((vector unsigned int)__a, + (vector unsigned int) __b); +} + static __inline__ vector unsigned int __ATTRS_o_ai vec_subc(vector unsigned int __a, vector unsigned int __b) { return __builtin_altivec_vsubcuw(__a, __b); @@ -9813,6 +11388,7 @@ vec_vsubuqm(vector unsigned __int128 __a, vector unsigned __int128 __b) { /* vec_vsubeuqm */ + static __inline__ vector signed __int128 __ATTRS_o_ai vec_vsubeuqm(vector signed __int128 __a, vector signed __int128 __b, vector signed __int128 __c) { @@ -9825,6 +11401,18 @@ vec_vsubeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b, return __builtin_altivec_vsubeuqm(__a, __b, __c); } +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_sube(vector signed __int128 __a, vector signed __int128 __b, + vector signed __int128 __c) { + return __builtin_altivec_vsubeuqm(__a, __b, __c); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_sube(vector unsigned __int128 __a, vector unsigned __int128 __b, + vector unsigned __int128 __c) { + return __builtin_altivec_vsubeuqm(__a, __b, __c); +} + /* vec_vsubcuq */ static __inline__ vector signed __int128 __ATTRS_o_ai @@ -9850,8 +11438,47 @@ vec_vsubecuq(vector unsigned __int128 __a, vector unsigned __int128 __b, vector unsigned __int128 __c) { return __builtin_altivec_vsubecuq(__a, __b, __c); } + +static __inline__ vector signed int __ATTRS_o_ai +vec_subec(vector signed int __a, vector signed int __b, + vector signed int __c) { + return vec_addec(__a, ~__b, __c); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_subec(vector unsigned int __a, vector unsigned int __b, + vector unsigned int __c) { + return vec_addec(__a, ~__b, __c); +} + +static __inline__ vector 
signed __int128 __ATTRS_o_ai +vec_subec(vector signed __int128 __a, vector signed __int128 __b, + vector signed __int128 __c) { + return __builtin_altivec_vsubecuq(__a, __b, __c); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_subec(vector unsigned __int128 __a, vector unsigned __int128 __b, + vector unsigned __int128 __c) { + return __builtin_altivec_vsubecuq(__a, __b, __c); +} #endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static __inline__ vector signed int __ATTRS_o_ai +vec_sube(vector signed int __a, vector signed int __b, + vector signed int __c) { + vector signed int __mask = {1, 1, 1, 1}; + vector signed int __carry = __c & __mask; + return vec_adde(__a, ~__b, __carry); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_sube(vector unsigned int __a, vector unsigned int __b, + vector unsigned int __c) { + vector unsigned int __mask = {1, 1, 1, 1}; + vector unsigned int __carry = __c & __mask; + return vec_adde(__a, ~__b, __carry); +} /* vec_sum4s */ static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a, @@ -10051,6 +11678,11 @@ vec_unpackh(vector bool int __a) { return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a); #endif } + +static __inline__ vector double __ATTRS_o_ai +vec_unpackh(vector float __a) { + return (vector double)(__a[0], __a[1]); +} #endif /* vec_vupkhsb */ @@ -10185,6 +11817,11 @@ vec_unpackl(vector bool int __a) { return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a); #endif } + +static __inline__ vector double __ATTRS_o_ai +vec_unpackl(vector float __a) { + return (vector double)(__a[2], __a[3]); +} #endif /* vec_vupklsb */ @@ -10935,6 +12572,55 @@ static __inline__ float __ATTRS_o_ai vec_extract(vector float __a, int __b) { return __a[__b]; } +#ifdef __POWER9_VECTOR__ + +/* vec_extract_exp */ + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_extract_exp(vector float __a) { + return __builtin_vsx_xvxexpsp(__a); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_extract_exp(vector double __a) { + return __builtin_vsx_xvxexpdp(__a); +} + +/* vec_extract_sig */ + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_extract_sig(vector float __a) { + return __builtin_vsx_xvxsigsp(__a); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_extract_sig (vector double __a) { + return __builtin_vsx_xvxsigdp(__a); +} + +static __inline__ vector float __ATTRS_o_ai +vec_extract_fp32_from_shorth(vector unsigned short __a) { + vector unsigned short __b = +#ifdef __LITTLE_ENDIAN__ + __builtin_shufflevector(__a, __a, 0, -1, 1, -1, 2, -1, 3, -1); +#else + __builtin_shufflevector(__a, __a, -1, 0, -1, 1, -1, 2, -1, 3); +#endif + return __builtin_vsx_xvcvhpsp(__b); +} + +static __inline__ vector float __ATTRS_o_ai +vec_extract_fp32_from_shortl(vector unsigned short __a) { + vector unsigned short __b = +#ifdef __LITTLE_ENDIAN__ + __builtin_shufflevector(__a, __a, 4, -1, 5, -1, 6, -1, 7, -1); +#else + __builtin_shufflevector(__a, __a, -1, 4, -1, 5, -1, 6, -1, 7); +#endif + return __builtin_vsx_xvcvhpsp(__b); +} +#endif /* __POWER9_VECTOR__ */ + /* vec_insert */ static __inline__ vector signed char __ATTRS_o_ai @@ -14369,6 +16055,24 @@ __builtin_crypto_vncipherlast(vector unsigned long long __a, #endif #ifdef __POWER8_VECTOR__ +static __inline__ vector bool char __ATTRS_o_ai +vec_permxor(vector bool char __a, vector bool char __b, + vector bool char __c) { + return __builtin_altivec_crypto_vpermxor(__a, __b, __c); +} + +static __inline__ 
vector signed char __ATTRS_o_ai +vec_permxor(vector signed char __a, vector signed char __b, + vector signed char __c) { + return __builtin_altivec_crypto_vpermxor(__a, __b, __c); +} + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_permxor(vector unsigned char __a, vector unsigned char __b, + vector unsigned char __c) { + return __builtin_altivec_crypto_vpermxor(__a, __b, __c); +} + static __inline__ vector unsigned char __ATTRS_o_ai __builtin_crypto_vpermxor(vector unsigned char __a, vector unsigned char __b, vector unsigned char __c) { @@ -14453,6 +16157,572 @@ vec_bperm(vector unsigned __int128 __a, vector unsigned char __b) { #endif #endif + +/* vec_reve */ + +static inline __ATTRS_o_ai vector bool char vec_reve(vector bool char __a) { + return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0); +} + +static inline __ATTRS_o_ai vector signed char vec_reve(vector signed char __a) { + return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_reve(vector unsigned char __a) { + return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0); +} + +static inline __ATTRS_o_ai vector bool int vec_reve(vector bool int __a) { + return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); +} + +static inline __ATTRS_o_ai vector signed int vec_reve(vector signed int __a) { + return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); +} + +static inline __ATTRS_o_ai vector unsigned int +vec_reve(vector unsigned int __a) { + return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); +} + +static inline __ATTRS_o_ai vector bool short vec_reve(vector bool short __a) { + return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0); +} + +static inline __ATTRS_o_ai vector signed short +vec_reve(vector signed short __a) { + return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0); +} + +static inline __ATTRS_o_ai vector unsigned short +vec_reve(vector unsigned short __a) { + return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0); +} + +static inline __ATTRS_o_ai vector float vec_reve(vector float __a) { + return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); +} + +#ifdef __VSX__ +static inline __ATTRS_o_ai vector bool long long +vec_reve(vector bool long long __a) { + return __builtin_shufflevector(__a, __a, 1, 0); +} + +static inline __ATTRS_o_ai vector signed long long +vec_reve(vector signed long long __a) { + return __builtin_shufflevector(__a, __a, 1, 0); +} + +static inline __ATTRS_o_ai vector unsigned long long +vec_reve(vector unsigned long long __a) { + return __builtin_shufflevector(__a, __a, 1, 0); +} + +static inline __ATTRS_o_ai vector double vec_reve(vector double __a) { + return __builtin_shufflevector(__a, __a, 1, 0); +} +#endif + +/* vec_revb */ +static __inline__ vector bool char __ATTRS_o_ai +vec_revb(vector bool char __a) { + return __a; +} + +static __inline__ vector signed char __ATTRS_o_ai +vec_revb(vector signed char __a) { + return __a; +} + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_revb(vector unsigned char __a) { + return __a; +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_revb(vector bool short __a) { + vector unsigned char __indices = + { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }; + return vec_perm(__a, __a, __indices); +} + +static __inline__ vector signed short __ATTRS_o_ai +vec_revb(vector signed short __a) { + vector unsigned char __indices = + { 
1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }; + return vec_perm(__a, __a, __indices); +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_revb(vector unsigned short __a) { + vector unsigned char __indices = + { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }; + return vec_perm(__a, __a, __indices); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_revb(vector bool int __a) { + vector unsigned char __indices = + { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; + return vec_perm(__a, __a, __indices); +} + +static __inline__ vector signed int __ATTRS_o_ai +vec_revb(vector signed int __a) { + vector unsigned char __indices = + { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; + return vec_perm(__a, __a, __indices); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_revb(vector unsigned int __a) { + vector unsigned char __indices = + { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; + return vec_perm(__a, __a, __indices); +} + +static __inline__ vector float __ATTRS_o_ai +vec_revb(vector float __a) { + vector unsigned char __indices = + { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; + return vec_perm(__a, __a, __indices); +} + +#ifdef __VSX__ +static __inline__ vector bool long long __ATTRS_o_ai +vec_revb(vector bool long long __a) { + vector unsigned char __indices = + { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; + return vec_perm(__a, __a, __indices); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_revb(vector signed long long __a) { + vector unsigned char __indices = + { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; + return vec_perm(__a, __a, __indices); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_revb(vector unsigned long long __a) { + vector unsigned char __indices = + { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; + return vec_perm(__a, __a, __indices); +} + +static __inline__ vector double __ATTRS_o_ai +vec_revb(vector double __a) { + vector unsigned char __indices = + { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; + return vec_perm(__a, __a, __indices); +} +#endif /* End __VSX__ */ + +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_revb(vector signed __int128 __a) { + vector unsigned char __indices = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + return (vector signed __int128)vec_perm((vector signed int)__a, + (vector signed int)__a, + __indices); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_revb(vector unsigned __int128 __a) { + vector unsigned char __indices = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + return (vector unsigned __int128)vec_perm((vector signed int)__a, + (vector signed int)__a, + __indices); +} +#endif /* END __POWER8_VECTOR__ && __powerpc64__ */ + +/* vec_xl */ + +static inline __ATTRS_o_ai vector signed char vec_xl(signed long long __offset, + signed char *__ptr) { + return *(vector signed char *)(__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_xl(signed long long __offset, unsigned char *__ptr) { + return *(vector unsigned char *)(__ptr + __offset); +} + +static inline __ATTRS_o_ai vector signed short vec_xl(signed long long __offset, + signed short *__ptr) { + return *(vector signed short *)(__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned short +vec_xl(signed long long __offset, unsigned short *__ptr) { + return *(vector 
unsigned short *)(__ptr + __offset); +} + +static inline __ATTRS_o_ai vector signed int vec_xl(signed long long __offset, + signed int *__ptr) { + return *(vector signed int *)(__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned int vec_xl(signed long long __offset, + unsigned int *__ptr) { + return *(vector unsigned int *)(__ptr + __offset); +} + +static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset, + float *__ptr) { + return *(vector float *)(__ptr + __offset); +} + +#ifdef __VSX__ +static inline __ATTRS_o_ai vector signed long long +vec_xl(signed long long __offset, signed long long *__ptr) { + return *(vector signed long long *)(__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned long long +vec_xl(signed long long __offset, unsigned long long *__ptr) { + return *(vector unsigned long long *)(__ptr + __offset); +} + +static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset, + double *__ptr) { + return *(vector double *)(__ptr + __offset); +} +#endif + +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static inline __ATTRS_o_ai vector signed __int128 +vec_xl(signed long long __offset, signed __int128 *__ptr) { + return *(vector signed __int128 *)(__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned __int128 +vec_xl(signed long long __offset, unsigned __int128 *__ptr) { + return *(vector unsigned __int128 *)(__ptr + __offset); +} +#endif + +/* vec_xl_be */ + +#ifdef __LITTLE_ENDIAN__ +static __inline__ vector signed char __ATTRS_o_ai +vec_xl_be(signed long long __offset, signed char *__ptr) { + vector signed char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr); + return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, + 13, 12, 11, 10, 9, 8); +} + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_xl_be(signed long long __offset, unsigned char *__ptr) { + vector unsigned char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr); + return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, + 13, 12, 11, 10, 9, 8); +} + +static __inline__ vector signed short __ATTRS_o_ai +vec_xl_be(signed long long __offset, signed short *__ptr) { + vector signed short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr); + return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_xl_be(signed long long __offset, unsigned short *__ptr) { + vector unsigned short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr); + return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); +} + +static __inline__ vector signed int __ATTRS_o_ai +vec_xl_be(signed long long __offset, signed int *__ptr) { + return (vector signed int)__builtin_vsx_lxvw4x_be(__offset, __ptr); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_xl_be(signed long long __offset, unsigned int *__ptr) { + return (vector unsigned int)__builtin_vsx_lxvw4x_be(__offset, __ptr); +} + +static __inline__ vector float __ATTRS_o_ai +vec_xl_be(signed long long __offset, float *__ptr) { + return (vector float)__builtin_vsx_lxvw4x_be(__offset, __ptr); +} + +#ifdef __VSX__ +static __inline__ vector signed long long __ATTRS_o_ai +vec_xl_be(signed long long __offset, signed long long *__ptr) { + return (vector signed long long)__builtin_vsx_lxvd2x_be(__offset, __ptr); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_xl_be(signed long long __offset, unsigned long long *__ptr) { + return (vector unsigned long 
long)__builtin_vsx_lxvd2x_be(__offset, __ptr); +} + +static __inline__ vector double __ATTRS_o_ai +vec_xl_be(signed long long __offset, double *__ptr) { + return (vector double)__builtin_vsx_lxvd2x_be(__offset, __ptr); +} +#endif + +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_xl_be(signed long long __offset, signed __int128 *__ptr) { + return vec_xl(__offset, __ptr); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_be(signed long long __offset, unsigned __int128 *__ptr) { + return vec_xl(__offset, __ptr); +} +#endif +#else + #define vec_xl_be vec_xl +#endif + +/* vec_xst */ + +static inline __ATTRS_o_ai void vec_xst(vector signed char __vec, + signed long long __offset, + signed char *__ptr) { + *(vector signed char *)(__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void vec_xst(vector unsigned char __vec, + signed long long __offset, + unsigned char *__ptr) { + *(vector unsigned char *)(__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void vec_xst(vector signed short __vec, + signed long long __offset, + signed short *__ptr) { + *(vector signed short *)(__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void vec_xst(vector unsigned short __vec, + signed long long __offset, + unsigned short *__ptr) { + *(vector unsigned short *)(__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void vec_xst(vector signed int __vec, + signed long long __offset, + signed int *__ptr) { + *(vector signed int *)(__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void vec_xst(vector unsigned int __vec, + signed long long __offset, + unsigned int *__ptr) { + *(vector unsigned int *)(__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void vec_xst(vector float __vec, + signed long long __offset, + float *__ptr) { + *(vector float *)(__ptr + __offset) = __vec; +} + +#ifdef __VSX__ +static inline __ATTRS_o_ai void vec_xst(vector signed long long __vec, + signed long long __offset, + signed long long *__ptr) { + *(vector signed long long *)(__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void vec_xst(vector unsigned long long __vec, + signed long long __offset, + unsigned long long *__ptr) { + *(vector unsigned long long *)(__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void vec_xst(vector double __vec, + signed long long __offset, + double *__ptr) { + *(vector double *)(__ptr + __offset) = __vec; +} +#endif + +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static inline __ATTRS_o_ai void vec_xst(vector signed __int128 __vec, + signed long long __offset, + signed __int128 *__ptr) { + *(vector signed __int128 *)(__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec, + signed long long __offset, + unsigned __int128 *__ptr) { + *(vector unsigned __int128 *)(__ptr + __offset) = __vec; +} +#endif + +/* vec_xst_be */ + +#ifdef __LITTLE_ENDIAN__ +static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed char __vec, + signed long long __offset, + signed char *__ptr) { + vector signed char __tmp = + __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, + 13, 12, 11, 10, 9, 8); + __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr); +} + +static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned char __vec, + signed long long __offset, + unsigned char *__ptr) { + vector unsigned char __tmp = + __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, + 13, 12, 
11, 10, 9, 8); + __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr); +} + +static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed short __vec, + signed long long __offset, + signed short *__ptr) { + vector signed short __tmp = + __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); + __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr); +} + +static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned short __vec, + signed long long __offset, + unsigned short *__ptr) { + vector unsigned short __tmp = + __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); + __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr); +} + +static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed int __vec, + signed long long __offset, + signed int *__ptr) { + __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr); +} + +static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned int __vec, + signed long long __offset, + unsigned int *__ptr) { + __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr); +} + +static __inline__ void __ATTRS_o_ai vec_xst_be(vector float __vec, + signed long long __offset, + float *__ptr) { + __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr); +} + +#ifdef __VSX__ +static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed long long __vec, + signed long long __offset, + signed long long *__ptr) { + __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr); +} + +static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned long long __vec, + signed long long __offset, + unsigned long long *__ptr) { + __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr); +} + +static __inline__ void __ATTRS_o_ai vec_xst_be(vector double __vec, + signed long long __offset, + double *__ptr) { + __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr); +} +#endif + +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed __int128 __vec, + signed long long __offset, + signed __int128 *__ptr) { + vec_xst(__vec, __offset, __ptr); +} + +static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned __int128 __vec, + signed long long __offset, + unsigned __int128 *__ptr) { + vec_xst(__vec, __offset, __ptr); +} +#endif +#else + #define vec_xst_be vec_xst +#endif + +#ifdef __POWER9_VECTOR__ +#define vec_test_data_class(__a, __b) \ + _Generic((__a), \ + vector float: \ + (vector bool int)__builtin_vsx_xvtstdcsp((__a), (__b)), \ + vector double: \ + (vector bool long long)__builtin_vsx_xvtstdcdp((__a), (__b)) \ + ) + +#endif /* #ifdef __POWER9_VECTOR__ */ + +static vector float __ATTRS_o_ai vec_neg(vector float __a) { + return -__a; +} + +#ifdef __VSX__ +static vector double __ATTRS_o_ai vec_neg(vector double __a) { + return -__a; +} + +#endif + +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static vector long long __ATTRS_o_ai vec_neg(vector long long __a) { + return -__a; +} +#endif + +static vector signed int __ATTRS_o_ai vec_neg(vector signed int __a) { + return -__a; +} + +static vector signed short __ATTRS_o_ai vec_neg(vector signed short __a) { + return -__a; +} + +static vector signed char __ATTRS_o_ai vec_neg(vector signed char __a) { + return -__a; +} + +static vector float __ATTRS_o_ai vec_nabs(vector float __a) { + return - vec_abs(__a); +} + +#ifdef __VSX__ +static vector double __ATTRS_o_ai vec_nabs(vector double __a) { + return - vec_abs(__a); +} + +#endif + +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static vector long long __ATTRS_o_ai vec_nabs(vector long long __a) { + return __builtin_altivec_vminsd(__a, -__a); +} 
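As an illustration only (editorial, not part of the commit or of altivec.h), a short sketch of the element-reverse, byte-reverse and negation helpers added in this hunk. It assumes clang -maltivec targeting PowerPC and this revision of the header; the values and variable names are invented for the example.

#include <altivec.h>
#include <stdio.h>

int main(void) {
  vector unsigned int v = {0x00112233, 0x44556677, 0x8899AABB, 0xCCDDEEFF};

  /* vec_reve reverses the order of the elements:
     {0xCCDDEEFF, 0x8899AABB, 0x44556677, 0x00112233}. */
  vector unsigned int rev_elems = vec_reve(v);

  /* vec_revb reverses the bytes within each element:
     {0x33221100, 0x77665544, 0xBBAA9988, 0xFFEEDDCC}. */
  vector unsigned int rev_bytes = vec_revb(v);

  /* vec_neg negates each element; vec_nabs yields -|x| per element. */
  vector float f = {3.5f, -4.25f, 1.0f, -7.0f};
  vector float neg = vec_neg(f);    /* {-3.5, 4.25, -1.0, 7.0} */
  vector float nabs = vec_nabs(f);  /* {-3.5, -4.25, -1.0, -7.0} */

  printf("%08x %08x\n", rev_elems[0], rev_bytes[0]);
  printf("%f %f\n", neg[1], nabs[1]);
  return 0;
}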
+#endif + +static vector signed int __ATTRS_o_ai vec_nabs(vector signed int __a) { + return __builtin_altivec_vminsw(__a, -__a); +} + +static vector signed short __ATTRS_o_ai vec_nabs(vector signed short __a) { + return __builtin_altivec_vminsh(__a, -__a); +} + +static vector signed char __ATTRS_o_ai vec_nabs(vector signed char __a) { + return __builtin_altivec_vminsb(__a, -__a); +} #undef __ATTRS_o_ai #endif /* __ALTIVEC_H */ diff --git a/lib/Headers/ammintrin.h b/lib/Headers/ammintrin.h index 8985bb404f47..2843a7a2677f 100644 --- a/lib/Headers/ammintrin.h +++ b/lib/Headers/ammintrin.h @@ -30,7 +30,7 @@ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"))) /// \brief Extracts the specified bits from the lower 64 bits of the 128-bit -/// integer vector operand at the index idx and of the length len. +/// integer vector operand at the index \a idx and of the length \a len. /// /// \headerfile <x86intrin.h> /// @@ -38,7 +38,7 @@ /// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx); /// \endcode /// -/// This intrinsic corresponds to the \c EXTRQ instruction. +/// This intrinsic corresponds to the <c> EXTRQ </c> instruction. /// /// \param x /// The value from which bits are extracted. @@ -49,8 +49,8 @@ /// Bits [5:0] specify the index of the least significant bit; the other /// bits are ignored. If the sum of the index and length is greater than 64, /// the result is undefined. If the length and index are both zero, bits -/// [63:0] of parameter x are extracted. If the length is zero but the index -/// is non-zero, the result is undefined. +/// [63:0] of parameter \a x are extracted. If the length is zero but the +/// index is non-zero, the result is undefined. /// \returns A 128-bit integer vector whose lower 64 bits contain the bits /// extracted from the source operand. #define _mm_extracti_si64(x, len, idx) \ @@ -58,11 +58,12 @@ (char)(len), (char)(idx))) /// \brief Extracts the specified bits from the lower 64 bits of the 128-bit -/// integer vector operand at the index and of the length specified by __y. +/// integer vector operand at the index and of the length specified by +/// \a __y. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c EXTRQ instruction. +/// This intrinsic corresponds to the <c> EXTRQ </c> instruction. /// /// \param __x /// The value from which bits are extracted. @@ -71,8 +72,8 @@ /// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the /// length is interpreted as 64. If the sum of the index and length is /// greater than 64, the result is undefined. If the length and index are -/// both zero, bits [63:0] of parameter __x are extracted. If the length is -/// zero but the index is non-zero, the result is undefined. +/// both zero, bits [63:0] of parameter \a __x are extracted. If the length +/// is zero but the index is non-zero, the result is undefined. /// \returns A 128-bit vector whose lower 64 bits contain the bits extracted /// from the source operand. static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -81,9 +82,9 @@ _mm_extract_si64(__m128i __x, __m128i __y) return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y); } -/// \brief Inserts bits of a specified length from the source integer vector y -/// into the lower 64 bits of the destination integer vector x at the index -/// idx and of the length len. 
+/// \brief Inserts bits of a specified length from the source integer vector +/// \a y into the lower 64 bits of the destination integer vector \a x at +/// the index \a idx and of the length \a len. /// /// \headerfile <x86intrin.h> /// @@ -92,15 +93,15 @@ _mm_extract_si64(__m128i __x, __m128i __y) /// const int idx); /// \endcode /// -/// This intrinsic corresponds to the \c INSERTQ instruction. +/// This intrinsic corresponds to the <c> INSERTQ </c> instruction. /// /// \param x /// The destination operand where bits will be inserted. The inserted bits -/// are defined by the length len and by the index idx specifying the least -/// significant bit. +/// are defined by the length \a len and by the index \a idx specifying the +/// least significant bit. /// \param y /// The source operand containing the bits to be extracted. The extracted -/// bits are the least significant bits of operand y of length len. +/// bits are the least significant bits of operand \a y of length \a len. /// \param len /// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0] /// are zero, the length is interpreted as 64. @@ -108,45 +109,43 @@ _mm_extract_si64(__m128i __x, __m128i __y) /// Bits [5:0] specify the index of the least significant bit; the other /// bits are ignored. If the sum of the index and length is greater than 64, /// the result is undefined. If the length and index are both zero, bits -/// [63:0] of parameter y are inserted into parameter x. If the length is -/// zero but the index is non-zero, the result is undefined. +/// [63:0] of parameter \a y are inserted into parameter \a x. If the length +/// is zero but the index is non-zero, the result is undefined. /// \returns A 128-bit integer vector containing the original lower 64-bits of -/// destination operand x with the specified bitfields replaced by the lower -/// bits of source operand y. The upper 64 bits of the return value are -/// undefined. - +/// destination operand \a x with the specified bitfields replaced by the +/// lower bits of source operand \a y. The upper 64 bits of the return value +/// are undefined. #define _mm_inserti_si64(x, y, len, idx) \ ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \ (__v2di)(__m128i)(y), \ (char)(len), (char)(idx))) /// \brief Inserts bits of a specified length from the source integer vector -/// __y into the lower 64 bits of the destination integer vector __x at the -/// index and of the length specified by __y. +/// \a __y into the lower 64 bits of the destination integer vector \a __x +/// at the index and of the length specified by \a __y. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c INSERTQ instruction. +/// This intrinsic corresponds to the <c> INSERTQ </c> instruction. /// /// \param __x /// The destination operand where bits will be inserted. The inserted bits /// are defined by the length and by the index of the least significant bit -/// specified by operand __y. +/// specified by operand \a __y. /// \param __y /// The source operand containing the bits to be extracted. The extracted -/// bits are the least significant bits of operand __y with length specified -/// by bits [69:64]. These are inserted into the destination at the index -/// specified by bits [77:72]; all other bits are ignored. If bits [69:64] -/// are zero, the length is interpreted as 64. If the sum of the index and -/// length is greater than 64, the result is undefined. 
If the length and -/// index are both zero, bits [63:0] of parameter __y are inserted into -/// parameter __x. If the length is zero but the index is non-zero, the -/// result is undefined. +/// bits are the least significant bits of operand \a __y with length +/// specified by bits [69:64]. These are inserted into the destination at the +/// index specified by bits [77:72]; all other bits are ignored. If bits +/// [69:64] are zero, the length is interpreted as 64. If the sum of the +/// index and length is greater than 64, the result is undefined. If the +/// length and index are both zero, bits [63:0] of parameter \a __y are +/// inserted into parameter \a __x. If the length is zero but the index is +/// non-zero, the result is undefined. /// \returns A 128-bit integer vector containing the original lower 64-bits of -/// destination operand __x with the specified bitfields replaced by the -/// lower bits of source operand __y. The upper 64 bits of the return value -/// are undefined. - +/// destination operand \a __x with the specified bitfields replaced by the +/// lower bits of source operand \a __y. The upper 64 bits of the return +/// value are undefined. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_insert_si64(__m128i __x, __m128i __y) { @@ -159,7 +158,7 @@ _mm_insert_si64(__m128i __x, __m128i __y) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c MOVNTSD instruction. +/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction. /// /// \param __p /// The 64-bit memory location used to store the register value. @@ -177,7 +176,7 @@ _mm_stream_sd(double *__p, __m128d __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c MOVNTSS instruction. +/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction. /// /// \param __p /// The 32-bit memory location used to store the register value. diff --git a/lib/Headers/armintr.h b/lib/Headers/armintr.h new file mode 100644 index 000000000000..933afcbb91b6 --- /dev/null +++ b/lib/Headers/armintr.h @@ -0,0 +1,45 @@ +/*===---- armintr.h - ARM Windows intrinsics -------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +/* Only include this if we're compiling for the windows platform. 
*/ +#ifndef _MSC_VER +#include_next <armintr.h> +#else + +#ifndef __ARMINTR_H +#define __ARMINTR_H + +typedef enum +{ + _ARM_BARRIER_SY = 0xF, + _ARM_BARRIER_ST = 0xE, + _ARM_BARRIER_ISH = 0xB, + _ARM_BARRIER_ISHST = 0xA, + _ARM_BARRIER_NSH = 0x7, + _ARM_BARRIER_NSHST = 0x6, + _ARM_BARRIER_OSH = 0x3, + _ARM_BARRIER_OSHST = 0x2 +} _ARMINTR_BARRIER_TYPE; + +#endif /* __ARMINTR_H */ +#endif /* _MSC_VER */ diff --git a/lib/Headers/avx512bwintrin.h b/lib/Headers/avx512bwintrin.h index d3c5a6c96446..629dc8611a7f 100644 --- a/lib/Headers/avx512bwintrin.h +++ b/lib/Headers/avx512bwintrin.h @@ -350,19 +350,17 @@ _mm512_add_epi8 (__m512i __A, __m512i __B) { } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); +_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_add_epi8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __U); +_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_add_epi8(__A, __B), + (__v64qi)_mm512_setzero_qi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -371,19 +369,17 @@ _mm512_sub_epi8 (__m512i __A, __m512i __B) { } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); +_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_sub_epi8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __U); +_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_sub_epi8(__A, __B), + (__v64qi)_mm512_setzero_qi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -392,19 +388,17 @@ _mm512_add_epi16 (__m512i __A, __m512i __B) { } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); +_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_add_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); +_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_add_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); } static 
__inline__ __m512i __DEFAULT_FN_ATTRS @@ -413,19 +407,17 @@ _mm512_sub_epi16 (__m512i __A, __m512i __B) { } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); +_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sub_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); +_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sub_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -434,19 +426,17 @@ _mm512_mullo_epi16 (__m512i __A, __m512i __B) { } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); +_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mullo_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); +_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mullo_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1018,31 +1008,25 @@ _mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shuffle_epi8 (__m512i __A, __m512i __B) +_mm512_shuffle_epi8(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) +_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_shuffle_epi8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_shuffle_epi8(__A, __B), + (__v64qi)_mm512_setzero_qi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1537,55 +1521,49 @@ _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { } 
static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepi8_epi16 (__m256i __A) +_mm512_cvtepi8_epi16(__m256i __A) { - return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) -1); + /* This function always performs a signed extension, but __v32qi is a char + which may be signed or unsigned, so use __v32qs. */ + return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A) +_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtepi8_epi16(__A), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A) +_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, - (__v32hi) - _mm512_setzero_hi(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtepi8_epi16(__A), + (__v32hi)_mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepu8_epi16 (__m256i __A) +_mm512_cvtepu8_epi16(__m256i __A) { - return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) -1); + return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A) +_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtepu8_epi16(__A), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A) +_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, - (__v32hi) - _mm512_setzero_hi(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtepu8_epi16(__A), + (__v32hi)_mm512_setzero_hi()); } @@ -1704,79 +1682,70 @@ _mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A) (__v32hi)_mm512_setzero_hi()); }) static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sllv_epi16 (__m512i __A, __m512i __B) +_mm512_sllv_epi16(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) -1); + return (__m512i)__builtin_ia32_psllv32hi((__v32hi) __A, (__v32hi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sllv_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, - 
(__v32hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sllv_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sll_epi16 (__m512i __A, __m128i __B) +_mm512_sll_epi16(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) -1); + return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m128i __B) +_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sll_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sll_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); } -#define _mm512_slli_epi16(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)_mm512_setzero_hi(), \ - (__mmask32)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_slli_epi16(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B); +} -#define _mm512_mask_slli_epi16(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)(__m512i)(W), \ - (__mmask32)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_slli_epi16(__A, __B), + (__v32hi)__W); +} -#define _mm512_maskz_slli_epi16(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)_mm512_setzero_hi(), \ - (__mmask32)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_slli_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); +} #define _mm512_bslli_epi128(a, imm) __extension__ ({ \ (__m512i)__builtin_shufflevector( \ @@ -1848,155 +1817,136 @@ _mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B) ((char)(imm)&0xF0) ? 63 : ((char)(imm)>0xF ? 
79 : 127) - (char)(imm)); }) static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srlv_epi16 (__m512i __A, __m512i __B) +_mm512_srlv_epi16(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) -1); + return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srlv_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srlv_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srav_epi16 (__m512i __A, __m512i __B) +_mm512_srav_epi16(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) -1); + return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srav_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srav_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sra_epi16 (__m512i __A, __m128i __B) +_mm512_sra_epi16(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) -1); + return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m128i __B) +_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sra_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, - 
(__v8hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sra_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); } -#define _mm512_srai_epi16(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)_mm512_setzero_hi(), \ - (__mmask32)-1); }) - -#define _mm512_mask_srai_epi16(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)(__m512i)(W), \ - (__mmask32)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_srai_epi16(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B); +} -#define _mm512_maskz_srai_epi16(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)_mm512_setzero_hi(), \ - (__mmask32)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srai_epi16(__A, __B), + (__v32hi)__W); +} +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srai_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); +} static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srl_epi16 (__m512i __A, __m128i __B) +_mm512_srl_epi16(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) -1); + return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m128i __B) +_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srl_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, - (__v8hi) __B, - (__v32hi) - _mm512_setzero_hi (), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srl_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); } -#define _mm512_srli_epi16(A, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \ - (__v32hi)_mm512_setzero_hi(), \ - (__mmask32)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_srli_epi16(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B); +} -#define _mm512_mask_srli_epi16(W, U, A, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \ - (__v32hi)(__m512i)(W), \ - (__mmask32)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srli_epi16(__A, __B), + (__v32hi)__W); +} -#define _mm512_maskz_srli_epi16(U, A, imm) __extension__ ({ \ - 
(__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \ - (__v32hi)_mm512_setzero_hi(), \ - (__mmask32)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srli_epi16(__A, __B), + (__v32hi)_mm512_setzero_hi()); +} #define _mm512_bsrli_epi128(a, imm) __extension__ ({ \ (__m512i)__builtin_shufflevector( \ diff --git a/lib/Headers/avx512dqintrin.h b/lib/Headers/avx512dqintrin.h index 13665e4c6668..ae44b98a9495 100644 --- a/lib/Headers/avx512dqintrin.h +++ b/lib/Headers/avx512dqintrin.h @@ -37,204 +37,169 @@ _mm512_mullo_epi64 (__m512i __A, __m512i __B) { } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); +_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_mullo_epi64(__A, __B), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); +_mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_mullo_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_xor_pd (__m512d __A, __m512d __B) { - return (__m512d) ((__v8du) __A ^ (__v8du) __B); +_mm512_xor_pd(__m512d __A, __m512d __B) { + return (__m512d)((__v8du)__A ^ (__v8du)__B); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U); +_mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_xor_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); +_mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_xor_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_xor_ps (__m512 __A, __m512 __B) { - return (__m512) ((__v16su) __A ^ (__v16su) __B); + return (__m512)((__v16su)__A ^ (__v16su)__B); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U); +_mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_xor_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - 
(__mmask16) __U); +_mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_xor_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_or_pd (__m512d __A, __m512d __B) { - return (__m512d) ((__v8du) __A | (__v8du) __B); +_mm512_or_pd(__m512d __A, __m512d __B) { + return (__m512d)((__v8du)__A | (__v8du)__B); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U); +_mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_or_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); +_mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_or_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_or_ps (__m512 __A, __m512 __B) { - return (__m512) ((__v16su) __A | (__v16su) __B); +_mm512_or_ps(__m512 __A, __m512 __B) { + return (__m512)((__v16su)__A | (__v16su)__B); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U); +_mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_or_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); +_mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_or_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_and_pd (__m512d __A, __m512d __B) { - return (__m512d) ((__v8du) __A & (__v8du) __B); +_mm512_and_pd(__m512d __A, __m512d __B) { + return (__m512d)((__v8du)__A & (__v8du)__B); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U); +_mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_and_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); +_mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_and_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_and_ps (__m512 __A, __m512 __B) { - 
return (__m512) ((__v16su) __A & (__v16su) __B); +_mm512_and_ps(__m512 __A, __m512 __B) { + return (__m512)((__v16su)__A & (__v16su)__B); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U); +_mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_and_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); +_mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_and_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_andnot_pd (__m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1); +_mm512_andnot_pd(__m512d __A, __m512d __B) { + return (__m512d)(~(__v8du)__A & (__v8du)__B); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U); +_mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_andnot_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); +_mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_andnot_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_andnot_ps (__m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1); +_mm512_andnot_ps(__m512 __A, __m512 __B) { + return (__m512)(~(__v16su)__A & (__v16su)__B); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U); +_mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_andnot_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); +_mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_andnot_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1151,148 +1116,184 @@ _mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) } #define 
_mm512_extractf32x8_ps(A, imm) __extension__ ({ \ - (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1); }) + (__m256)__builtin_shufflevector((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + ((imm) & 1) ? 8 : 0, \ + ((imm) & 1) ? 9 : 1, \ + ((imm) & 1) ? 10 : 2, \ + ((imm) & 1) ? 11 : 3, \ + ((imm) & 1) ? 12 : 4, \ + ((imm) & 1) ? 13 : 5, \ + ((imm) & 1) ? 14 : 6, \ + ((imm) & 1) ? 15 : 7); }) #define _mm512_mask_extractf32x8_ps(W, U, A, imm) __extension__ ({ \ - (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)); }) + (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \ + (__v8sf)(W)); }) #define _mm512_maskz_extractf32x8_ps(U, A, imm) __extension__ ({ \ - (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)); }) + (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \ + (__v8sf)_mm256_setzero_ps()); }) #define _mm512_extractf64x2_pd(A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1); }) + (__m128d)__builtin_shufflevector((__v8df)(__m512d)(A), \ + (__v8df)_mm512_undefined_pd(), \ + 0 + ((imm) & 0x3) * 2, \ + 1 + ((imm) & 0x3) * 2); }) #define _mm512_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)); }) + (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm512_extractf64x2_pd((A), (imm)), \ + (__v2df)(W)); }) #define _mm512_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)); }) + (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm512_extractf64x2_pd((A), (imm)), \ + (__v2df)_mm_setzero_pd()); }) #define _mm512_extracti32x8_epi32(A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1); }) + (__m256i)__builtin_shufflevector((__v16si)(__m512i)(A), \ + (__v16si)_mm512_undefined_epi32(), \ + ((imm) & 1) ? 8 : 0, \ + ((imm) & 1) ? 9 : 1, \ + ((imm) & 1) ? 10 : 2, \ + ((imm) & 1) ? 11 : 3, \ + ((imm) & 1) ? 12 : 4, \ + ((imm) & 1) ? 13 : 5, \ + ((imm) & 1) ? 14 : 6, \ + ((imm) & 1) ? 
15 : 7); }) #define _mm512_mask_extracti32x8_epi32(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \ + (__v8si)(W)); }) #define _mm512_maskz_extracti32x8_epi32(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \ + (__v8si)_mm256_setzero_si256()); }) #define _mm512_extracti64x2_epi64(A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ - (int)(imm), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)-1); }) + (__m128i)__builtin_shufflevector((__v8di)(__m512i)(A), \ + (__v8di)_mm512_undefined_epi32(), \ + 0 + ((imm) & 0x3) * 2, \ + 1 + ((imm) & 0x3) * 2); }) #define _mm512_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ - (int)(imm), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U)); }) + (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \ + (__v2di)(W)); }) #define _mm512_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ - (int)(imm), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)(U)); }) + (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \ + (__v2di)_mm_setzero_di()); }) #define _mm512_insertf32x8(A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \ - (__v8sf)(__m256)(B), (int)(imm), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1); }) + (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_castps256_ps512((__m256)(B)),\ + ((imm) & 0x1) ? 0 : 16, \ + ((imm) & 0x1) ? 1 : 17, \ + ((imm) & 0x1) ? 2 : 18, \ + ((imm) & 0x1) ? 3 : 19, \ + ((imm) & 0x1) ? 4 : 20, \ + ((imm) & 0x1) ? 5 : 21, \ + ((imm) & 0x1) ? 6 : 22, \ + ((imm) & 0x1) ? 7 : 23, \ + ((imm) & 0x1) ? 16 : 8, \ + ((imm) & 0x1) ? 17 : 9, \ + ((imm) & 0x1) ? 18 : 10, \ + ((imm) & 0x1) ? 19 : 11, \ + ((imm) & 0x1) ? 20 : 12, \ + ((imm) & 0x1) ? 21 : 13, \ + ((imm) & 0x1) ? 22 : 14, \ + ((imm) & 0x1) ? 
23 : 15); }) #define _mm512_mask_insertf32x8(W, U, A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \ - (__v8sf)(__m256)(B), (int)(imm), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ + (__v16sf)(W)); }) #define _mm512_maskz_insertf32x8(U, A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \ - (__v8sf)(__m256)(B), (int)(imm), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps()); }) #define _mm512_insertf64x2(A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(imm), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1); }) + (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \ + (__v8df)_mm512_castpd128_pd512((__m128d)(B)),\ + (((imm) & 0x3) == 0) ? 8 : 0, \ + (((imm) & 0x3) == 0) ? 9 : 1, \ + (((imm) & 0x3) == 1) ? 8 : 2, \ + (((imm) & 0x3) == 1) ? 9 : 3, \ + (((imm) & 0x3) == 2) ? 8 : 4, \ + (((imm) & 0x3) == 2) ? 9 : 5, \ + (((imm) & 0x3) == 3) ? 8 : 6, \ + (((imm) & 0x3) == 3) ? 9 : 7); }) #define _mm512_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(imm), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ + (__v8df)(W)); }) #define _mm512_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(imm), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd()); }) #define _mm512_inserti32x8(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \ - (__v8si)(__m256i)(B), (int)(imm), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) + (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \ + (__v16si)_mm512_castsi256_si512((__m256i)(B)),\ + ((imm) & 0x1) ? 0 : 16, \ + ((imm) & 0x1) ? 1 : 17, \ + ((imm) & 0x1) ? 2 : 18, \ + ((imm) & 0x1) ? 3 : 19, \ + ((imm) & 0x1) ? 4 : 20, \ + ((imm) & 0x1) ? 5 : 21, \ + ((imm) & 0x1) ? 6 : 22, \ + ((imm) & 0x1) ? 7 : 23, \ + ((imm) & 0x1) ? 16 : 8, \ + ((imm) & 0x1) ? 17 : 9, \ + ((imm) & 0x1) ? 18 : 10, \ + ((imm) & 0x1) ? 19 : 11, \ + ((imm) & 0x1) ? 20 : 12, \ + ((imm) & 0x1) ? 21 : 13, \ + ((imm) & 0x1) ? 22 : 14, \ + ((imm) & 0x1) ? 
23 : 15); }) #define _mm512_mask_inserti32x8(W, U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \ - (__v8si)(__m256i)(B), (int)(imm), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ + (__v16si)(W)); }) #define _mm512_maskz_inserti32x8(U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \ - (__v8si)(__m256i)(B), (int)(imm), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512()); }) #define _mm512_inserti64x2(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \ - (__v2di)(__m128i)(B), \ - (int)(imm), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) + (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \ + (__v8di)_mm512_castsi128_si512((__m128i)(B)),\ + (((imm) & 0x3) == 0) ? 8 : 0, \ + (((imm) & 0x3) == 0) ? 9 : 1, \ + (((imm) & 0x3) == 1) ? 8 : 2, \ + (((imm) & 0x3) == 1) ? 9 : 3, \ + (((imm) & 0x3) == 2) ? 8 : 4, \ + (((imm) & 0x3) == 2) ? 9 : 5, \ + (((imm) & 0x3) == 3) ? 8 : 6, \ + (((imm) & 0x3) == 3) ? 9 : 7); }) #define _mm512_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \ - (__v2di)(__m128i)(B), \ - (int)(imm), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ + (__v8di)(W)); }) #define _mm512_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \ - (__v2di)(__m128i)(B), \ - (int)(imm), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512()); }) #define _mm512_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \ (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h index 0bf6582345d4..e6a7217c8967 100644 --- a/lib/Headers/avx512fintrin.h +++ b/lib/Headers/avx512fintrin.h @@ -54,6 +54,19 @@ typedef unsigned short __mmask16; #define _MM_FROUND_TO_ZERO 0x03 #define _MM_FROUND_CUR_DIRECTION 0x04 +/* Constants for integer comparison predicates */ +typedef enum { + _MM_CMPINT_EQ, /* Equal */ + _MM_CMPINT_LT, /* Less than */ + _MM_CMPINT_LE, /* Less than or Equal */ + _MM_CMPINT_UNUSED, + _MM_CMPINT_NE, /* Not Equal */ + _MM_CMPINT_NLT, /* Not Less than */ +#define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */ + _MM_CMPINT_NLE /* Not Less than or Equal */ +#define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */ +} _MM_CMPINT_ENUM; + typedef enum { _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02, @@ -503,6 +516,18 @@ _mm512_castsi512_si256 (__m512i __A) return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3); } +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_int2mask(int __a) +{ + return (__mmask16)__a; +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask2int(__mmask16 __a) +{ + return (int)__a; +} + /* Bitwise operators */ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_and_epi32(__m512i __a, __m512i __b) @@ -737,22 +762,19 @@ _mm512_add_epi64 (__m512i __A, __m512i 
__B) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_add_epi64(__A, __B), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_add_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -762,22 +784,19 @@ _mm512_sub_epi64 (__m512i __A, __m512i __B) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sub_epi64(__A, __B), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sub_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -787,22 +806,19 @@ _mm512_add_epi32 (__m512i __A, __m512i __B) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_add_epi32(__A, __B), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_add_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -812,22 +828,19 @@ _mm512_sub_epi32 (__m512i __A, __m512i __B) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sub_epi32(__A, __B), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) { - return 
(__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sub_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } #define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \ @@ -1403,57 +1416,45 @@ _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) static __inline __m512i __DEFAULT_FN_ATTRS _mm512_mul_epi32(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) __W, __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epi32(__X, __Y), + (__v8di)__W); } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epi32(__X, __Y), + (__v8di)_mm512_setzero_si512 ()); } static __inline __m512i __DEFAULT_FN_ATTRS _mm512_mul_epu32(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) __W, __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epu32(__X, __Y), + (__v8di)__W); } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y) +_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, - (__v16si) __Y, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epu32(__X, __Y), + (__v8di)_mm512_setzero_si512 ()); } static __inline __m512i __DEFAULT_FN_ATTRS @@ -1463,21 +1464,19 @@ _mm512_mullo_epi32 (__m512i __A, __m512i __B) } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_mullo_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) 
__builtin_ia32_pmulld512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_mullo_epi32(__A, __B), + (__v16si)__W); } #define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \ @@ -1977,38 +1976,30 @@ _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_add_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_add_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_add_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_add_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } #define _mm512_add_round_pd(A, B, R) __extension__ ({ \ @@ -2120,40 +2111,30 @@ _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_sub_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_sub_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_sub_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_sub_ps(__A, __B), 
+ (__v16sf)_mm512_setzero_ps()); } #define _mm512_sub_round_pd(A, B, R) __extension__ ({ \ @@ -2265,40 +2246,30 @@ _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_mul_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_mul_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_mul_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_mul_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } #define _mm512_mul_round_pd(A, B, R) __extension__ ({ \ @@ -2417,21 +2388,16 @@ _mm512_div_pd(__m512d __a, __m512d __b) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_div_pd(__A, __B), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_div_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } static __inline __m512 __DEFAULT_FN_ATTRS @@ -2442,21 +2408,16 @@ _mm512_div_ps(__m512 __a, __m512 __b) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_div_ps(__A, __B), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_div_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } #define _mm512_div_round_pd(A, B, R) __extension__ ({ \ 
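/* A hedged caller-side sketch of the mask/maskz semantics that the select-based
 * rewrites above implement; the values and mask are chosen only for
 * illustration, and an AVX-512F target with <immintrin.h> included is assumed:
 *
 *   __m512d src = _mm512_set1_pd(9.0);
 *   __m512d a   = _mm512_set1_pd(1.0);
 *   __m512d b   = _mm512_set1_pd(2.0);
 *   __mmask8 k  = 0x0F;                              // lanes 0-3 active
 *   __m512d r1  = _mm512_mask_add_pd(src, k, a, b);  // lanes 0-3: 3.0, lanes 4-7: 9.0 (from src)
 *   __m512d r2  = _mm512_maskz_add_pd(k, a, b);      // lanes 0-3: 3.0, lanes 4-7: 0.0
 */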
@@ -3443,71 +3404,94 @@ _mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A, } #define _mm512_alignr_epi64(A, B, I) __extension__ ({ \ - (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(I), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) + (__m512i)__builtin_shufflevector((__v8di)(__m512i)(B), \ + (__v8di)(__m512i)(A), \ + ((int)(I) & 0x7) + 0, \ + ((int)(I) & 0x7) + 1, \ + ((int)(I) & 0x7) + 2, \ + ((int)(I) & 0x7) + 3, \ + ((int)(I) & 0x7) + 4, \ + ((int)(I) & 0x7) + 5, \ + ((int)(I) & 0x7) + 6, \ + ((int)(I) & 0x7) + 7); }) #define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__({\ - (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(imm), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ + (__v8di)(__m512i)(W)); }) #define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__({\ - (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(imm), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512()); }) #define _mm512_alignr_epi32(A, B, I) __extension__ ({ \ - (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(I), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) + (__m512i)__builtin_shufflevector((__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(A), \ + ((int)(I) & 0xf) + 0, \ + ((int)(I) & 0xf) + 1, \ + ((int)(I) & 0xf) + 2, \ + ((int)(I) & 0xf) + 3, \ + ((int)(I) & 0xf) + 4, \ + ((int)(I) & 0xf) + 5, \ + ((int)(I) & 0xf) + 6, \ + ((int)(I) & 0xf) + 7, \ + ((int)(I) & 0xf) + 8, \ + ((int)(I) & 0xf) + 9, \ + ((int)(I) & 0xf) + 10, \ + ((int)(I) & 0xf) + 11, \ + ((int)(I) & 0xf) + 12, \ + ((int)(I) & 0xf) + 13, \ + ((int)(I) & 0xf) + 14, \ + ((int)(I) & 0xf) + 15); }) #define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({\ - (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(imm), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ + (__v16si)(__m512i)(W)); }) #define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__({\ - (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(imm), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512()); }) /* Vector Extract */ -#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ - (__v4df)_mm256_setzero_si256(), \ - (__mmask8)-1); }) +#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \ + (__m256d)__builtin_shufflevector((__v8df)(__m512d)(A), \ + (__v8df)_mm512_undefined_pd(), \ + ((I) & 1) ? 4 : 0, \ + ((I) & 1) ? 5 : 1, \ + ((I) & 1) ? 6 : 2, \ + ((I) & 1) ? 
7 : 3); }) #define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({\ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)); }) + (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm512_extractf64x4_pd((A), (imm)), \ + (__v4df)(W)); }) #define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({\ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)); }) + (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm512_extractf64x4_pd((A), (imm)), \ + (__v4df)_mm256_setzero_pd()); }) -#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1); }) +#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \ + (__m128)__builtin_shufflevector((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + 0 + ((I) & 0x3) * 4, \ + 1 + ((I) & 0x3) * 4, \ + 2 + ((I) & 0x3) * 4, \ + 3 + ((I) & 0x3) * 4); }) #define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({\ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)); }) + (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \ + (__v4sf)(W)); }) #define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({\ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)); }) + (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \ + (__v4sf)_mm_setzero_ps()); }) + /* Vector Blend */ static __inline __m512d __DEFAULT_FN_ATTRS @@ -3556,10 +3540,49 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) #define _mm512_cmp_ps_mask(A, B, P) \ _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) - #define _mm512_mask_cmp_ps_mask(U, A, B, P) \ _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) +#define _mm512_cmpeq_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ) +#define _mm512_mask_cmpeq_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ) + +#define _mm512_cmplt_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS) +#define _mm512_mask_cmplt_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS) + +#define _mm512_cmple_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS) +#define _mm512_mask_cmple_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS) + +#define _mm512_cmpunord_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q) +#define _mm512_mask_cmpunord_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q) + +#define _mm512_cmpneq_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ) +#define _mm512_mask_cmpneq_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ) + +#define _mm512_cmpnlt_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US) +#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US) + +#define _mm512_cmpnle_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US) +#define _mm512_mask_cmpnle_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US) + +#define _mm512_cmpord_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q) +#define _mm512_mask_cmpord_ps_mask(k, A, B) 
\ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q) + #define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \ (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(P), \ @@ -3572,10 +3595,49 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) #define _mm512_cmp_pd_mask(A, B, P) \ _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) - #define _mm512_mask_cmp_pd_mask(U, A, B, P) \ _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) +#define _mm512_cmpeq_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ) +#define _mm512_mask_cmpeq_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ) + +#define _mm512_cmplt_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS) +#define _mm512_mask_cmplt_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS) + +#define _mm512_cmple_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS) +#define _mm512_mask_cmple_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS) + +#define _mm512_cmpunord_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q) +#define _mm512_mask_cmpunord_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q) + +#define _mm512_cmpneq_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ) +#define _mm512_mask_cmpneq_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ) + +#define _mm512_cmpnlt_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US) +#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US) + +#define _mm512_cmpnle_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US) +#define _mm512_mask_cmpnle_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US) + +#define _mm512_cmpord_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q) +#define _mm512_mask_cmpord_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q) + /* Conversion */ #define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \ @@ -3682,26 +3744,35 @@ _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) static __inline __m512d __DEFAULT_FN_ATTRS _mm512_cvtepi32_pd(__m256i __A) { - return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1); + return (__m512d)__builtin_convertvector((__v8si)__A, __v8df); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) { - return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, - (__v8df) __W, - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepi32_pd(__A), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) { - return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, - (__v8df) _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepi32_pd(__A), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS +_mm512_cvtepi32lo_pd(__m512i __A) +{ + return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A)); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS +_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) +{ + return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A)); } static __inline__ __m512 __DEFAULT_FN_ATTRS @@ -3734,26 +3805,35 @@ 
_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) static __inline __m512d __DEFAULT_FN_ATTRS _mm512_cvtepu32_pd(__m256i __A) { - return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1); + return (__m512d)__builtin_convertvector((__v8su)__A, __v8df); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) { - return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, - (__v8df) __W, - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepu32_pd(__A), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) { - return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, - (__v8df) _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepu32_pd(__A), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS +_mm512_cvtepu32lo_pd(__m512i __A) +{ + return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A)); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS +_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) +{ + return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A)); } #define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \ @@ -3798,6 +3878,24 @@ _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } +static __inline__ __m512 __DEFAULT_FN_ATTRS +_mm512_cvtpd_pslo (__m512d __A) +{ + return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A), + (__v8sf) _mm256_setzero_ps (), + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS +_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) +{ + return (__m512) __builtin_shufflevector ( + (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W), + __U, __A), + (__v8sf) _mm256_setzero_ps (), + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + #define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \ (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ (__v16hi)_mm256_undefined_si256(), \ @@ -4919,263 +5017,227 @@ _mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepi8_epi32 (__m128i __A) +_mm512_cvtepi8_epi32(__m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. 
*/ + return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A) +_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi8_epi32(__A), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A) +_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi8_epi32(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepi8_epi64 (__m128i __A) +_mm512_cvtepi8_epi64(__m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi8_epi64(__A), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi8_epi64(__A), + (__v8di)_mm512_setzero_si512 ()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepi32_epi64 (__m256i __X) +_mm512_cvtepi32_epi64(__m256i __X) { - return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_convertvector((__v8si)__X, __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X) +_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { - return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi32_epi64(__X), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X) +_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) { - return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi32_epi64(__X), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepi16_epi32 (__m256i __A) +_mm512_cvtepi16_epi32(__m256i __A) { - return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, - (__v16si) - 
_mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A) +_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi16_epi32(__A), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A) +_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi16_epi32(__A), + (__v16si)_mm512_setzero_si512 ()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepi16_epi64 (__m128i __A) +_mm512_cvtepi16_epi64(__m128i __A) { - return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi16_epi64(__A), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) +_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi16_epi64(__A), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepu8_epi32 (__m128i __A) +_mm512_cvtepu8_epi32(__m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A) +_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu8_epi32(__A), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A) +_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu8_epi32(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepu8_epi64 (__m128i __A) +_mm512_cvtepu8_epi64(__m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 
1, 2, 3, 4, 5, 6, 7), __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu8_epi64(__A), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) +_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu8_epi64(__A), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepu32_epi64 (__m256i __X) +_mm512_cvtepu32_epi64(__m256i __X) { - return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_convertvector((__v8su)__X, __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X) +_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { - return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu32_epi64(__X), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X) +_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) { - return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu32_epi64(__X), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepu16_epi32 (__m256i __A) +_mm512_cvtepu16_epi32(__m256i __A) { - return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A) +_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu16_epi32(__A), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A) +_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) { - return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu16_epi32(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_cvtepu16_epi64 (__m128i __A) +_mm512_cvtepu16_epi64(__m128i __A) { - return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A) 
+_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu16_epi64(__A), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { - return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu16_epi64(__A), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -5393,67 +5455,91 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) (__v8di)_mm512_setzero_si512(), \ (__mmask8)(U)); }) -#define _mm512_slli_epi32(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) - -#define _mm512_mask_slli_epi32(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) - -#define _mm512_maskz_slli_epi32(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_slli_epi32(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B); +} -#define _mm512_slli_epi64(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_slli_epi32(__A, __B), + (__v16si)__W); +} -#define _mm512_mask_slli_epi64(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_slli_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} -#define _mm512_maskz_slli_epi64(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_slli_epi64(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B); +} +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_slli_epi64(__A, __B), + (__v8di)__W); +} +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_slli_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} -#define _mm512_srli_epi32(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS 
+_mm512_srli_epi32(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B); +} -#define _mm512_mask_srli_epi32(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srli_epi32(__A, __B), + (__v16si)__W); +} -#define _mm512_maskz_srli_epi32(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srli_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} -#define _mm512_srli_epi64(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_srli_epi64(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B); +} -#define _mm512_mask_srli_epi64(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srli_epi64(__A, __B), + (__v8di)__W); +} -#define _mm512_maskz_srli_epi64(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srli_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) @@ -5911,8 +5997,10 @@ _mm512_kmov (__mmask16 __A) (int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ (int)(P), (int)(R)); }) +#ifdef __x86_64__ #define _mm_cvt_roundsd_si64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); }) +#endif static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I, @@ -5926,351 +6014,267 @@ _mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I, } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sll_epi32 (__m512i __A, __m128i __B) +_mm512_sll_epi32(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + 
(__v16si)_mm512_sll_epi32(__A, __B), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sll_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sll_epi64 (__m512i __A, __m128i __B) +_mm512_sll_epi64(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sll_epi64(__A, __B), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sll_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sllv_epi32 (__m512i __X, __m512i __Y) +_mm512_sllv_epi32(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sllv_epi32(__X, __Y), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sllv_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sllv_epi64 (__m512i __X, __m512i __Y) +_mm512_sllv_epi64(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) 
__builtin_ia32_psllv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sllv_epi64(__X, __Y), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sllv_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sra_epi32 (__m512i __A, __m128i __B) +_mm512_sra_epi32(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sra_epi32(__A, __B), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sra_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_sra_epi64 (__m512i __A, __m128i __B) +_mm512_sra_epi64(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sra_epi64(__A, __B), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sra_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srav_epi32 (__m512i __X, __m512i __Y) +_mm512_srav_epi32(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srav_epi32 (__m512i 
__W, __mmask16 __U, __m512i __X, __m512i __Y) +_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srav_epi32(__X, __Y), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srav_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srav_epi64 (__m512i __X, __m512i __Y) +_mm512_srav_epi64(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srav_epi64(__X, __Y), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srav_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srl_epi32 (__m512i __A, __m128i __B) +_mm512_srl_epi32(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srl_epi32(__A, __B), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, - (__v4si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srl_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srl_epi64 (__m512i __A, __m128i __B) +_mm512_srl_epi64(__m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - 
(__mmask8) -1); + return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srl_epi64(__A, __B), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, - (__v2di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srl_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_srlv_epi32 (__m512i __X, __m512i __Y) +_mm512_srlv_epi32(__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srlv_epi32(__X, __Y), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, - (__v16si) __Y, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srlv_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_srlv_epi64 (__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srlv_epi64(__X, __Y), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srlv_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512()); } #define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \ @@ -6309,8 +6313,10 @@ _mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) 
(__v8di)(__m512i)(C), (int)(imm), \ (__mmask8)(U)); }) +#ifdef __x86_64__ #define _mm_cvt_roundsd_i64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); }) +#endif #define _mm_cvt_roundsd_si32(A, R) __extension__ ({ \ (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); }) @@ -6328,6 +6334,7 @@ _mm_cvtsd_u32 (__m128d __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvt_roundsd_u64(A, R) __extension__ ({ \ (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \ (int)(R)); }) @@ -6339,6 +6346,7 @@ _mm_cvtsd_u64 (__m128d __A) __A, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvt_roundss_si32(A, R) __extension__ ({ \ (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); }) @@ -6346,11 +6354,13 @@ _mm_cvtsd_u64 (__m128d __A) #define _mm_cvt_roundss_i32(A, R) __extension__ ({ \ (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); }) +#ifdef __x86_64__ #define _mm_cvt_roundss_si64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); }) #define _mm_cvt_roundss_i64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); }) +#endif #define _mm_cvt_roundss_u32(A, R) __extension__ ({ \ (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); }) @@ -6362,6 +6372,7 @@ _mm_cvtss_u32 (__m128 __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvt_roundss_u64(A, R) __extension__ ({ \ (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \ (int)(R)); }) @@ -6373,6 +6384,7 @@ _mm_cvtss_u64 (__m128 __A) __A, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ \ (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); }) @@ -6387,6 +6399,7 @@ _mm_cvttsd_i32 (__m128d __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); }) @@ -6399,6 +6412,7 @@ _mm_cvttsd_i64 (__m128d __A) return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ \ (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); }) @@ -6410,6 +6424,7 @@ _mm_cvttsd_u32 (__m128d __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ \ (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \ (int)(R)); }) @@ -6421,6 +6436,7 @@ _mm_cvttsd_u64 (__m128d __A) __A, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvtt_roundss_i32(A, R) __extension__ ({ \ (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); }) @@ -6435,6 +6451,7 @@ _mm_cvttss_i32 (__m128 __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvtt_roundss_i64(A, R) __extension__ ({ \ (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); }) @@ -6447,6 +6464,7 @@ _mm_cvttss_i64 (__m128 __A) return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvtt_roundss_u32(A, R) __extension__ ({ \ (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); }) @@ -6458,6 +6476,7 @@ _mm_cvttss_u32 (__m128 __A) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvtt_roundss_u64(A, R) __extension__ ({ \ (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \ (int)(R)); }) @@ -6469,6 +6488,7 @@ 
_mm_cvttss_u64 (__m128 __A) __A, _MM_FROUND_CUR_DIRECTION); } +#endif static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U, @@ -6556,61 +6576,47 @@ _mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I, (__v16sf)_mm512_setzero_ps()); }) static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_permutevar_pd (__m512d __A, __m512i __C) +_mm512_permutevar_pd(__m512d __A, __m512i __C) { - return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, - (__v8di) __C, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) +_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) { - return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, - (__v8di) __C, - (__v8df) __W, - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_permutevar_pd(__A, __C), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C) +_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) { - return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, - (__v8di) __C, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_permutevar_pd(__A, __C), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_permutevar_ps (__m512 __A, __m512i __C) +_mm512_permutevar_ps(__m512 __A, __m512i __C) { - return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, - (__v16si) __C, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) +_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) { - return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, - (__v16si) __C, - (__v16sf) __W, - (__mmask16) __U); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_permutevar_ps(__A, __C), + (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C) +_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) { - return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, - (__v16si) __C, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_permutevar_ps(__A, __C), + (__v16sf)_mm512_setzero_ps()); } static __inline __m512d __DEFAULT_FN_ATTRS @@ -7028,35 +7034,48 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) (__mmask8)(U), \ _MM_FROUND_CUR_DIRECTION); }) -#define _mm512_srai_epi32(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_srai_epi32(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B); +} -#define _mm512_mask_srai_epi32(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)(__m512i)(W), \ - 
(__mmask16)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \ + (__v16si)_mm512_srai_epi32(__A, __B), \ + (__v16si)__W); +} -#define _mm512_maskz_srai_epi32(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \ + (__v16si)_mm512_srai_epi32(__A, __B), \ + (__v16si)_mm512_setzero_si512()); +} -#define _mm512_srai_epi64(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_srai_epi64(__m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B); +} -#define _mm512_mask_srai_epi64(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \ + (__v8di)_mm512_srai_epi64(__A, __B), \ + (__v8di)__W); +} -#define _mm512_maskz_srai_epi64(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \ + (__v8di)_mm512_srai_epi64(__A, __B), \ + (__v8di)_mm512_setzero_si512()); +} #define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \ (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \ @@ -7832,107 +7851,145 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); } -#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ - (__mmask8)-1); }) +#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \ + (__m128i)__builtin_shufflevector((__v16si)(__m512i)(A), \ + (__v16si)_mm512_undefined_epi32(), \ + 0 + ((imm) & 0x3) * 4, \ + 1 + ((imm) & 0x3) * 4, \ + 2 + ((imm) & 0x3) * 4, \ + 3 + ((imm) & 0x3) * 4); }) #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \ + (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \ + (__v4si)__W); }) #define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \ + (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \ + (__v4si)_mm_setzero_si128()); }) -#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_undefined_si256(), \ - 
(__mmask8)-1); }) +#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \ + (__m256i)__builtin_shufflevector((__v8di)(__m512i)(A), \ + (__v8di)_mm512_undefined_epi32(), \ + ((imm) & 1) ? 4 : 0, \ + ((imm) & 1) ? 5 : 1, \ + ((imm) & 1) ? 6 : 2, \ + ((imm) & 1) ? 7 : 3); }) #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \ + (__v4di)__W); }) #define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \ + (__v4di)_mm256_setzero_si256()); }) #define _mm512_insertf64x4(A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \ - (__v4df)(__m256d)(B), (int)(imm), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1); }) + (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \ + (__v8df)_mm512_castpd256_pd512((__m256d)(B)), \ + ((imm) & 0x1) ? 0 : 8, \ + ((imm) & 0x1) ? 1 : 9, \ + ((imm) & 0x1) ? 2 : 10, \ + ((imm) & 0x1) ? 3 : 11, \ + ((imm) & 0x1) ? 8 : 4, \ + ((imm) & 0x1) ? 9 : 5, \ + ((imm) & 0x1) ? 10 : 6, \ + ((imm) & 0x1) ? 11 : 7); }) #define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \ - (__v4df)(__m256d)(B), (int)(imm), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ + (__v8df)(W)); }) #define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \ - (__v4df)(__m256d)(B), (int)(imm), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd()); }) #define _mm512_inserti64x4(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \ - (__v4di)(__m256i)(B), (int)(imm), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) + (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \ + (__v8di)_mm512_castsi256_si512((__m256i)(B)), \ + ((imm) & 0x1) ? 0 : 8, \ + ((imm) & 0x1) ? 1 : 9, \ + ((imm) & 0x1) ? 2 : 10, \ + ((imm) & 0x1) ? 3 : 11, \ + ((imm) & 0x1) ? 8 : 4, \ + ((imm) & 0x1) ? 9 : 5, \ + ((imm) & 0x1) ? 10 : 6, \ + ((imm) & 0x1) ? 
11 : 7); }) #define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \ - (__v4di)(__m256i)(B), (int)(imm), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ + (__v8di)(W)); }) #define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \ - (__v4di)(__m256i)(B), (int)(imm), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512()); }) #define _mm512_insertf32x4(A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \ - (__v4sf)(__m128)(B), (int)(imm), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1); }) + (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_castps128_ps512((__m128)(B)),\ + (((imm) & 0x3) == 0) ? 16 : 0, \ + (((imm) & 0x3) == 0) ? 17 : 1, \ + (((imm) & 0x3) == 0) ? 18 : 2, \ + (((imm) & 0x3) == 0) ? 19 : 3, \ + (((imm) & 0x3) == 1) ? 16 : 4, \ + (((imm) & 0x3) == 1) ? 17 : 5, \ + (((imm) & 0x3) == 1) ? 18 : 6, \ + (((imm) & 0x3) == 1) ? 19 : 7, \ + (((imm) & 0x3) == 2) ? 16 : 8, \ + (((imm) & 0x3) == 2) ? 17 : 9, \ + (((imm) & 0x3) == 2) ? 18 : 10, \ + (((imm) & 0x3) == 2) ? 19 : 11, \ + (((imm) & 0x3) == 3) ? 16 : 12, \ + (((imm) & 0x3) == 3) ? 17 : 13, \ + (((imm) & 0x3) == 3) ? 18 : 14, \ + (((imm) & 0x3) == 3) ? 19 : 15); }) #define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \ - (__v4sf)(__m128)(B), (int)(imm), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ + (__v16sf)(W)); }) #define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \ - (__v4sf)(__m128)(B), (int)(imm), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps()); }) #define _mm512_inserti32x4(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \ - (__v4si)(__m128i)(B), (int)(imm), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) + (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \ + (__v16si)_mm512_castsi128_si512((__m128i)(B)),\ + (((imm) & 0x3) == 0) ? 16 : 0, \ + (((imm) & 0x3) == 0) ? 17 : 1, \ + (((imm) & 0x3) == 0) ? 18 : 2, \ + (((imm) & 0x3) == 0) ? 19 : 3, \ + (((imm) & 0x3) == 1) ? 16 : 4, \ + (((imm) & 0x3) == 1) ? 17 : 5, \ + (((imm) & 0x3) == 1) ? 18 : 6, \ + (((imm) & 0x3) == 1) ? 19 : 7, \ + (((imm) & 0x3) == 2) ? 16 : 8, \ + (((imm) & 0x3) == 2) ? 17 : 9, \ + (((imm) & 0x3) == 2) ? 18 : 10, \ + (((imm) & 0x3) == 2) ? 19 : 11, \ + (((imm) & 0x3) == 3) ? 16 : 12, \ + (((imm) & 0x3) == 3) ? 17 : 13, \ + (((imm) & 0x3) == 3) ? 18 : 14, \ + (((imm) & 0x3) == 3) ? 
19 : 15); }) #define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \ - (__v4si)(__m128i)(B), (int)(imm), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ + (__v16si)(W)); }) #define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \ - (__v4si)(__m128i)(B), (int)(imm), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512()); }) #define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \ (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ @@ -8275,17 +8332,17 @@ __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A, + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, (__v4sf) __B, - (__v4sf) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -8323,17 +8380,17 @@ _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A, + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, -(__v4sf) __B, - (__v4sf) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -8355,33 +8412,33 @@ _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, + return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, (__v4sf) __X, - -(__v4sf) __Y, + (__v4sf) __Y, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ + (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ (__v4sf)(__m128)(X), \ - -(__v4sf)(__m128)(Y), (__mmask8)(U), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A, + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, (__v4sf) __B, - (__v4sf) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define 
_mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -8419,17 +8476,17 @@ _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A, + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, -(__v4sf) __B, - (__v4sf) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -8451,33 +8508,33 @@ _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W, + return (__m128) __builtin_ia32_vfnmsubss3_mask3 ((__v4sf) __W, (__v4sf) __X, - -(__v4sf) __Y, + (__v4sf) __Y, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\ - (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \ + (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \ (__v4sf)(__m128)(X), \ - -(__v4sf)(__m128)(Y), (__mmask8)(U), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A, + return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, + (__v2df) __A, (__v2df) __B, - (__v2df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS @@ -8515,17 +8572,17 @@ _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A, + return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, + (__v2df) __A, -(__v2df) __B, - (__v2df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS @@ -8547,33 +8604,33 @@ _mm_maskz_fmsub_sd (__mmask8 __U, 
__m128d __A, __m128d __B, __m128d __C) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, + return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, (__v2df) __X, - -(__v2df) __Y, + (__v2df) __Y, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ + (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ (__v2df)(__m128d)(X), \ - -(__v2df)(__m128d)(Y), \ + (__v2df)(__m128d)(Y), \ (__mmask8)(U), (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A, + return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, + -(__v2df) __A, (__v2df) __B, - (__v2df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS @@ -8611,17 +8668,17 @@ _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A, + return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, + -(__v2df) __A, -(__v2df) __B, - (__v2df) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), (__mmask8)(U), \ (int)(R)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS @@ -8644,17 +8701,17 @@ _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) (__W), + return (__m128d) __builtin_ia32_vfnmsubsd3_mask3 ((__v2df) (__W), (__v2df) __X, - -(__v2df) (__Y), + (__v2df) (__Y), (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } #define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \ + (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \ (__v2df)(__m128d)(X), \ - -(__v2df)(__m128d)(Y), \ + (__v2df)(__m128d)(Y), \ (__mmask8)(U), (int)(R)); }) #define _mm512_permutex_pd(X, C) __extension__ ({ \ @@ -9041,6 +9098,101 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) (__v16sf)_mm512_setzero_ps()); } +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + __m128 res = __A; + res[0] = (__U & 1) ? __B[0] : __W[0]; + return res; +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + __m128 res = __A; + res[0] = (__U & 1) ? 
__B[0] : 0; + return res; +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + __m128d res = __A; + res[0] = (__U & 1) ? __B[0] : __W[0]; + return res; +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + __m128d res = __A; + res[0] = (__U & 1) ? __B[0] : 0; + return res; +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_storess128_mask ((__v16sf *)__W, + (__v16sf) _mm512_castps128_ps512(__A), + (__mmask16) __U & (__mmask16)1); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_storesd128_mask ((__v8df *)__W, + (__v8df) _mm512_castpd128_pd512(__A), + (__mmask8) __U & 1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A) +{ + __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W, + (__v4sf) {0.0, 0.0, 0.0, 0.0}, + 0, 4, 4, 4); + + return (__m128) __builtin_shufflevector( + __builtin_ia32_loadss128_mask ((__v16sf *) __A, + (__v16sf) _mm512_castps128_ps512(src), + (__mmask16) __U & 1), + _mm512_undefined_ps(), 0, 1, 2, 3); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_maskz_load_ss (__mmask8 __U, const float* __A) +{ + return (__m128) __builtin_shufflevector( + __builtin_ia32_loadss128_mask ((__v16sf *) __A, + (__v16sf) _mm512_setzero_ps(), + (__mmask16) __U & 1), + _mm512_undefined_ps(), 0, 1, 2, 3); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A) +{ + __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W, + (__v2df) {0.0, 0.0}, 0, 2); + + return (__m128d) __builtin_shufflevector( + __builtin_ia32_loadsd128_mask ((__v8df *) __A, + (__v8df) _mm512_castpd128_pd512(src), + (__mmask8) __U & 1), + _mm512_undefined_pd(), 0, 1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_maskz_load_sd (__mmask8 __U, const double* __A) +{ + return (__m128d) __builtin_shufflevector( + __builtin_ia32_loadsd128_mask ((__v8df *) __A, + (__v8df) _mm512_setzero_pd(), + (__mmask8) __U & 1), + _mm512_undefined_pd(), 0, 1); +} + #define _mm512_shuffle_epi32(A, I) __extension__ ({ \ (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \ (__v16si)_mm512_undefined_epi32(), \ @@ -9243,6 +9395,18 @@ _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) _MM_FROUND_CUR_DIRECTION); } +static __inline__ __m512 __DEFAULT_FN_ATTRS +_mm512_cvtpslo_pd (__m512 __A) +{ + return (__m512) _mm512_cvtps_pd(_mm512_castps512_ps256(__A)); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS +_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A) +{ + return (__m512) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A)); +} + static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) { @@ -9340,14 +9504,17 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) } #define _mm_cvtss_i32 _mm_cvtss_si32 -#define _mm_cvtss_i64 _mm_cvtss_si64 #define _mm_cvtsd_i32 _mm_cvtsd_si32 -#define _mm_cvtsd_i64 _mm_cvtsd_si64 #define _mm_cvti32_sd _mm_cvtsi32_sd -#define _mm_cvti64_sd _mm_cvtsi64_sd #define _mm_cvti32_ss _mm_cvtsi32_ss +#ifdef __x86_64__ +#define _mm_cvtss_i64 _mm_cvtss_si64 +#define _mm_cvtsd_i64 _mm_cvtsd_si64 +#define _mm_cvti64_sd _mm_cvtsi64_sd #define _mm_cvti64_ss _mm_cvtsi64_ss +#endif +#ifdef __x86_64__ #define 
_mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \ (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ (int)(R)); }) @@ -9355,6 +9522,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) #define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \ (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ (int)(R)); }) +#endif #define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); }) @@ -9362,6 +9530,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) #define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); }) +#ifdef __x86_64__ #define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ (int)(R)); }) @@ -9369,6 +9538,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) #define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ (int)(R)); }) +#endif #define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \ (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ @@ -9412,6 +9582,7 @@ _mm_cvtu32_sd (__m128d __A, unsigned __B) return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B); } +#ifdef __x86_64__ #define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \ (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \ (unsigned long long)(B), (int)(R)); }) @@ -9422,6 +9593,7 @@ _mm_cvtu64_sd (__m128d __A, unsigned long long __B) return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, _MM_FROUND_CUR_DIRECTION); } +#endif #define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \ @@ -9434,6 +9606,7 @@ _mm_cvtu32_ss (__m128 __A, unsigned __B) _MM_FROUND_CUR_DIRECTION); } +#ifdef __x86_64__ #define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \ (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \ (unsigned long long)(B), (int)(R)); }) @@ -9444,6 +9617,7 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B) return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, _MM_FROUND_CUR_DIRECTION); } +#endif static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) @@ -9452,12 +9626,14 @@ _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) __M); } +#ifdef __x86_64__ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) { return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O, __M); } +#endif static __inline __m512i __DEFAULT_FN_ATTRS _mm512_set_epi32 (int __A, int __B, int __C, int __D, @@ -9514,27 +9690,553 @@ _mm512_set_ps (float __A, float __B, float __C, float __D, (e4),(e3),(e2),(e1),(e0)) static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_abs_ps(__m512 A) +_mm512_abs_ps(__m512 __A) { - return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ; + return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_abs_ps(__m512 W, __mmask16 K, __m512 A) +_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A) { - return (__m512)_mm512_mask_and_epi32((__m512i)W, K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ; + return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; } static 
__inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_abs_pd(__m512d A) +_mm512_abs_pd(__m512d __A) { - return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A) ; + return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ; } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_abs_pd(__m512d W, __mmask8 K, __m512d A) -{ - return (__m512d)_mm512_mask_and_epi64((__v8di)W, K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A); +_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) +{ + return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A); +} + +// Vector-reduction arithmetic accepts vectors as inputs and produces scalars as +// outputs. This class of vector operation forms the basis of many scientific +// computations. In vector-reduction arithmetic, the evaluation off is +// independent of the order of the input elements of V. + +// Used bisection method. At each step, we partition the vector with previous +// step in half, and the operation is performed on its two halves. +// This takes log2(n) steps where n is the number of elements in the vector. + +// Vec512 - Vector with size 512. +// Operator - Can be one of following: +,*,&,| +// T2 - Can get 'i' for int and 'f' for float. +// T1 - Can get 'i' for int and 'd' for double. + +#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1) \ + __extension__({ \ + __m256##T1 Vec256 = __builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 0, 1, 2, 3) \ + Operator \ + __builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 4, 5, 6, 7); \ + __m128##T1 Vec128 = __builtin_shufflevector( \ + (__v4d##T2)Vec256, \ + (__v4d##T2)Vec256, \ + 0, 1) \ + Operator \ + __builtin_shufflevector( \ + (__v4d##T2)Vec256, \ + (__v4d##T2)Vec256, \ + 2, 3); \ + Vec128 = __builtin_shufflevector((__v2d##T2)Vec128, \ + (__v2d##T2)Vec128, 0, -1) \ + Operator \ + __builtin_shufflevector((__v2d##T2)Vec128, \ + (__v2d##T2)Vec128, 1, -1); \ + return Vec128[0]; \ + }) + +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) { + _mm512_reduce_operator_64bit(__W, +, i, i); +} + +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) { + _mm512_reduce_operator_64bit(__W, *, i, i); +} + +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) { + _mm512_reduce_operator_64bit(__W, &, i, i); +} + +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) { + _mm512_reduce_operator_64bit(__W, |, i, i); +} + +static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) { + _mm512_reduce_operator_64bit(__W, +, f, d); +} + +static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) { + _mm512_reduce_operator_64bit(__W, *, f, d); +} + +// Vec512 - Vector with size 512. +// Vec512Neutral - All vector elements set to the identity element. +// Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0} +// Operator - Can be one of following: +,*,&,| +// Mask - Intrinsic Mask +// T2 - Can get 'i' for int and 'f' for float. +// T1 - Can get 'i' for int and 'd' for packed double-precision. +// T3 - Can be Pd for packed double or q for q-word. 
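// Illustration, not part of the header: a scalar model of what the masked
// reductions below compute. Lanes whose mask bit is clear are first replaced
// with the operation's identity element (0 for +, 1 for *, all-ones for &,
// 0 for |), so they cannot change the result of the bisection that follows.
// Everything here (mask_reduce_add_i64, the plain array form) is local to this
// sketch and assumes nothing beyond standard C.
static inline long long mask_reduce_add_i64(unsigned char mask, const long long v[8]) {
  long long acc = 0;                         // 0 is the identity element for +
  for (int i = 0; i < 8; ++i)
    acc += ((mask >> i) & 1) ? v[i] : 0;     // cleared mask bits contribute the identity
  return acc;
}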
+ +#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator, \ + Mask, T2, T1, T3) \ + __extension__({ \ + Vec512 = __builtin_ia32_select##T3##_512( \ + (__mmask8)Mask, \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512Neutral); \ + _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1); \ + }) + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q); +} + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q); +} + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), + &, __M, i, i, q); +} + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M, + i, i, q); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M, + f, d, pd); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M, + f, d, pd); +} + +// Vec512 - Vector with size 512. +// Operator - Can be one of following: +,*,&,| +// T2 - Can get 'i' for int and ' ' for packed single. +// T1 - Can get 'i' for int and 'f' for float. + +#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \ + __m256##T1 Vec256 = \ + (__m256##T1)(__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, 1, 2, 3, 4, 5, 6, 7) \ + Operator \ + __builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 8, 9, 10, 11, 12, 13, 14, 15)); \ + __m128##T1 Vec128 = \ + (__m128##T1)(__builtin_shufflevector( \ + (__v8s##T2)Vec256, \ + (__v8s##T2)Vec256, \ + 0, 1, 2, 3) \ + Operator \ + __builtin_shufflevector( \ + (__v8s##T2)Vec256, \ + (__v8s##T2)Vec256, \ + 4, 5, 6, 7)); \ + Vec128 = (__m128##T1)(__builtin_shufflevector( \ + (__v4s##T2)Vec128, \ + (__v4s##T2)Vec128, \ + 0, 1, -1, -1) \ + Operator \ + __builtin_shufflevector( \ + (__v4s##T2)Vec128, \ + (__v4s##T2)Vec128, \ + 2, 3, -1, -1)); \ + Vec128 = (__m128##T1)(__builtin_shufflevector( \ + (__v4s##T2)Vec128, \ + (__v4s##T2)Vec128, \ + 0, -1, -1, -1) \ + Operator \ + __builtin_shufflevector( \ + (__v4s##T2)Vec128, \ + (__v4s##T2)Vec128, \ + 1, -1, -1, -1)); \ + return Vec128[0]; \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_add_epi32(__m512i __W) { + _mm512_reduce_operator_32bit(__W, +, i, i); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_mul_epi32(__m512i __W) { + _mm512_reduce_operator_32bit(__W, *, i, i); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_and_epi32(__m512i __W) { + _mm512_reduce_operator_32bit(__W, &, i, i); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_reduce_or_epi32(__m512i __W) { + _mm512_reduce_operator_32bit(__W, |, i, i); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_reduce_add_ps(__m512 __W) { + _mm512_reduce_operator_32bit(__W, +, f, ); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_reduce_mul_ps(__m512 __W) { + _mm512_reduce_operator_32bit(__W, *, f, ); +} + +// Vec512 - Vector with size 512. 
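// Illustration, not part of the header: the log2(n) bisection that the
// _mm512_reduce_operator_* macros above expand to, written out for an
// 8 x 64-bit add using clang's generic vector extensions. The names v8i64 and
// reduce_add_v8i64 exist only in this sketch. Each step shuffles the upper
// half down next to the lower half and applies the operator, so eight lanes
// collapse to one in three steps.
typedef long long v8i64 __attribute__((vector_size(64)));

static inline long long reduce_add_v8i64(v8i64 v) {
  v8i64 h4 = __builtin_shufflevector(v, v, 4, 5, 6, 7, -1, -1, -1, -1);
  v8i64 s4 = v + h4;    // lanes 0..3 hold sums of lane pairs (i, i+4)
  v8i64 h2 = __builtin_shufflevector(s4, s4, 2, 3, -1, -1, -1, -1, -1, -1);
  v8i64 s2 = s4 + h2;   // lanes 0..1 hold partial sums
  v8i64 h1 = __builtin_shufflevector(s2, s2, 1, -1, -1, -1, -1, -1, -1, -1);
  v8i64 s1 = s2 + h1;   // lane 0 holds the full sum
  return s1[0];
}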
+// Vec512Neutral - All vector elements set to the identity element. +// Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0} +// Operator - Can be one of following: +,*,&,| +// Mask - Intrinsic Mask +// T2 - Can get 'i' for int and 'f' for float. +// T1 - Can get 'i' for int and 'd' for double. +// T3 - Can be Ps for packed single or d for d-word. + +#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator, \ + Mask, T2, T1, T3) \ + __extension__({ \ + Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ + (__mmask16)Mask, \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512Neutral); \ + _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1); \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M, + i, i, d); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps); +} + +// Used bisection method. At each step, we partition the vector with previous +// step in half, and the operation is performed on its two halves. +// This takes log2(n) steps where n is the number of elements in the vector. +// This macro uses only intrinsics from the AVX512F feature. + +// Vec512 - Vector with size of 512. +// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example: +// __mm512_max_epi64 +// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}] +// T2 - Can get 'i' for int and 'f' for float. 
[__v8d{i|f}] + +#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 0, 1, 2, 3, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 4, 5, 6, 7, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 0, 1, -1, -1, -1, -1, -1, -1),\ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 2, 3, -1, -1, -1, -1, -1, \ + -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 0, -1, -1, -1, -1, -1, -1, -1),\ + (__m512##T1)__builtin_shufflevector( \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512, \ + 1, -1, -1, -1, -1, -1, -1, -1))\ + ; \ + return Vec512[0]; \ + }) + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_reduce_max_epi64(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm512_reduce_max_epu64(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_reduce_max_pd(__m512d __V) { + _mm512_reduce_maxMin_64bit(__V, max_pd, d, f); +} + +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64 +(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, min_epi64, i, i); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm512_reduce_min_epu64(__m512i __V) { + _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_reduce_min_pd(__m512d __V) { + _mm512_reduce_maxMin_64bit(__V, min_pd, d, f); +} + +// Vec512 - Vector with size 512. +// Vec512Neutral - A 512 length vector with elements set to the identity element +// Identity element: {max_epi,0x8000000000000000} +// {max_epu,0x0000000000000000} +// {max_pd, 0xFFF0000000000000} +// {min_epi,0x7FFFFFFFFFFFFFFF} +// {min_epu,0xFFFFFFFFFFFFFFFF} +// {min_pd, 0x7FF0000000000000} +// +// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example: +// __mm512_max_epi64 +// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}] +// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}] +// T3 - Can get 'q' q word and 'pd' for packed double. 
+// [__builtin_ia32_select{q|pd}_512] +// Mask - Intrinsic Mask + +#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \ + T2, T3, Mask) \ + __extension__({ \ + Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ + (__mmask8)Mask, \ + (__v8d##T2)Vec512, \ + (__v8d##T2)Vec512Neutral); \ + _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \ + }) + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000), + max_epi64, i, i, q, __M); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000), + max_epu64, i, i, q, __M); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) { + _mm512_mask_reduce_maxMin_64bit(__V, -_mm512_set1_pd(__builtin_inf()), + max_pd, d, f, pd, __M); +} + +static __inline__ long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), + min_epi64, i, i, q, __M); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), + min_epu64, i, i, q, __M); +} + +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { + _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()), + min_pd, d, f, pd, __M); +} + +// Vec512 - Vector with size 512. +// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example: +// __mm512_max_epi32 +// T1 - Can get 'i' for int and ' ' .[__m512{i|}] +// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}] + +#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, 1, 2, 3, 4, 5, 6, 7, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 8, 9, 10, 11, 12, 13, 14, 15, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, 1, 2, 3, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 4, 5, 6, 7, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, 1, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 2, 3, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + Vec512 = _mm512_##IntrinName( \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 0, -1, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1), \ + (__m512##T1)__builtin_shufflevector( \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512, \ + 1, -1, -1, -1, -1, -1, -1, -1, \ + -1, -1, -1, -1, -1, -1, -1, -1)); \ + return Vec512[0]; \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i a) { + 
_mm512_reduce_maxMin_32bit(a, max_epi32, i, i); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_reduce_max_epu32(__m512i a) { + _mm512_reduce_maxMin_32bit(a, max_epu32, i, i); +} + +static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 a) { + _mm512_reduce_maxMin_32bit(a, max_ps, , f); +} + +static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i a) { + _mm512_reduce_maxMin_32bit(a, min_epi32, i, i); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_reduce_min_epu32(__m512i a) { + _mm512_reduce_maxMin_32bit(a, min_epu32, i, i); +} + +static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 a) { + _mm512_reduce_maxMin_32bit(a, min_ps, , f); +} + +// Vec512 - Vector with size 512. +// Vec512Neutral - A 512 length vector with elements set to the identity element +// Identity element: {max_epi,0x80000000} +// {max_epu,0x00000000} +// {max_ps, 0xFF800000} +// {min_epi,0x7FFFFFFF} +// {min_epu,0xFFFFFFFF} +// {min_ps, 0x7F800000} +// +// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example: +// __mm512_max_epi32 +// T1 - Can get 'i' for int and ' ' .[__m512{i|}] +// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}] +// T3 - Can get 'q' q word and 'pd' for packed double. +// [__builtin_ia32_select{q|pd}_512] +// Mask - Intrinsic Mask + +#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \ + T2, T3, Mask) \ + __extension__({ \ + Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ + (__mmask16)Mask, \ + (__v16s##T2)Vec512, \ + (__v16s##T2)Vec512Neutral); \ + _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \ + }) + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32, + i, i, d, __M); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32, + i, i, d, __M); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) { + _mm512_mask_reduce_maxMin_32bit(__V,-_mm512_set1_ps(__builtin_inff()), max_ps, , f, + ps, __M); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32, + i, i, d, __M); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32, + i, i, d, __M); +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) { + _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()), min_ps, , f, + ps, __M); } #undef __DEFAULT_FN_ATTRS diff --git a/lib/Headers/avx512vlbwintrin.h b/lib/Headers/avx512vlbwintrin.h index 990e992a113f..3b58d043395a 100644 --- a/lib/Headers/avx512vlbwintrin.h +++ b/lib/Headers/avx512vlbwintrin.h @@ -615,172 +615,143 @@ _mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){ - return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) __W, - (__mmask32) __U); +_mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){ 
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_add_epi8(__A, __B), + (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) - _mm256_setzero_si256 (), - (__mmask32) __U); +_mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_add_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); +_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_add_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); +_mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_add_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) __W, - (__mmask32) __U); +_mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_sub_epi8(__A, __B), + (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) - _mm256_setzero_si256 (), - (__mmask32) __U); +_mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_sub_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); +_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sub_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); +_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sub_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } + static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __U); +_mm_mask_add_epi8(__m128i __W, __mmask16 __U, 
__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_add_epi8(__A, __B), + (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) - _mm_setzero_si128 (), - (__mmask16) __U); +_mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_add_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); +_mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_add_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); +_mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_add_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __U); +_mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_sub_epi8(__A, __B), + (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) - _mm_setzero_si128 (), - (__mmask16) __U); +_mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_sub_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); +_mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sub_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); +_mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sub_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); +_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, 
+ (__v16hi)_mm256_mullo_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); +_mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mullo_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); +_mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mullo_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); +_mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mullo_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -816,937 +787,802 @@ _mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W) } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A) +_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A, - (__v16qi) __W, - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_abs_epi8(__A), + (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A) +_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_abs_epi8(__A), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A) +_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) { - return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A, - (__v32qi) __W, - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_abs_epi8(__A), + (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A) { - return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_abs_epi8(__A), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_abs_epi16(__A), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A) +_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) 
{ - return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_abs_epi16(__A), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A) +_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) { - return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_abs_epi16(__A), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A) +_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { - return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A, - (__v16hi) _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_abs_epi16(__A), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A, - (__v4si) __B, - (__v8hi) _mm_setzero_si128 (), __M); +_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_packs_epi32(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_packs_epi32 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) +_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A, - (__v4si) __B, - (__v8hi) __W, __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_packs_epi32(__A, __B), + (__v8hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B) +_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A, - (__v8si) __B, - (__v16hi) _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_packs_epi32(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) +_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A, - (__v8si) __B, - (__v16hi) __W, __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_packs_epi32(__A, __B), + (__v16hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B) +_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v16qi) _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_packs_epi16(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) +_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v16qi) __W, - __M); + return 
(__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_packs_epi16(__A, __B), + (__v16qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B) +_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v32qi) _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_packs_epi16(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) +_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v32qi) __W, - __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_packs_epi16(__A, __B), + (__v32qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A, - (__v4si) __B, - (__v8hi) _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_packus_epi32(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_packus_epi32 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) +_mm_mask_packus_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A, - (__v4si) __B, - (__v8hi) __W, __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_packus_epi32(__A, __B), + (__v8hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B) +_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A, - (__v8si) __B, - (__v16hi) _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_packus_epi32(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) +_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A, - (__v8si) __B, - (__v16hi) __W, - __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_packus_epi32(__A, __B), + (__v16hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B) +_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v16qi) _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_packus_epi16(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) +_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v16qi) __W, - __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + 
(__v16qi)_mm_packus_epi16(__A, __B), + (__v16qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B) +_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v32qi) _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_packus_epi16(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) +_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v32qi) __W, - __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_packus_epi16(__A, __B), + (__v32qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) +_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_adds_epi8(__A, __B), + (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_adds_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) +_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) __W, - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_adds_epi8(__A, __B), + (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_adds_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_adds_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_adds_epi16(__A, __B), + 
(__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) +_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_adds_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_adds_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) +_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_adds_epu8(__A, __B), + (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B) +_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_adds_epu8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) +_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) __W, - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_adds_epu8(__A, __B), + (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B) +_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_adds_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_adds_epu16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_adds_epu16(__A, __B), + 
(__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) +_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_adds_epu16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B) +_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_adds_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) +_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_avg_epu8(__A, __B), + (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B) +_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_avg_epu8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) +_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) __W, - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_avg_epu8(__A, __B), + (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B) +_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_avg_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_avg_epu16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_avg_epu16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static 
__inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) +_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_avg_epu16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B) +_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_avg_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B) +_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_max_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) +_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_max_epi8(__A, __B), + (__v16qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B) +_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_max_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) +_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) __W, - (__mmask32) __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_max_epi8(__A, __B), + (__v32qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B) +_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_max_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_max_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS 
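The pattern throughout this hunk is the same: each masked intrinsic is now written as the unmasked operation followed by a per-lane select builtin, merging with __W for the _mask_ form and with zero for the _maskz_ form, instead of calling a dedicated *_mask builtin. A minimal usage sketch of those semantics, assuming a compiler and CPU with AVX512BW and AVX512VL (the values, file name, and build line are illustrative, not part of the commit):

#include <immintrin.h>
#include <stdio.h>

/* Build with, e.g.: cc -O2 -mavx512bw -mavx512vl select_demo.c */
int main(void) {
  __m128i a = _mm_set1_epi8(5);
  __m128i b = _mm_set1_epi8(9);
  __m128i w = _mm_set1_epi8(-1);          /* merge source for the _mask_ form */
  __mmask16 k = 0x00FF;                   /* select only the low 8 byte lanes */
  __m128i merged = _mm_mask_max_epi8(w, k, a, b);  /* lanes 0-7: max(5,9)=9, lanes 8-15: w=-1 */
  __m128i zeroed = _mm_maskz_max_epi8(k, a, b);    /* lanes 0-7: 9, lanes 8-15: 0 */
  signed char out[16];
  _mm_storeu_si128((__m128i *)out, merged);
  printf("merged: lane0=%d lane15=%d\n", out[0], out[15]);
  _mm_storeu_si128((__m128i *)out, zeroed);
  printf("zeroed: lane0=%d lane15=%d\n", out[0], out[15]);
  return 0;
}

The observable results are unchanged by the rewrite; only the way the header expresses the masking to the compiler differs.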
-_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B) +_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) _mm256_setzero_si256 (), - (__mmask16) __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_max_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) +_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_max_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B) +_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_max_epu8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) +_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_max_epu8(__A, __B), + (__v16qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_max_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) +_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) __W, - (__mmask32) __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_max_epu8(__A, __B), + (__v32qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B) +_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_max_epu16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_max_epu16(__A, __B), + (__v8hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B) +_mm256_maskz_max_epu16(__mmask16 __M, 
__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) _mm256_setzero_si256 (), - (__mmask16) __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_max_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) +_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_max_epu16(__A, __B), + (__v16hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B) +_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_min_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) +_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_min_epi8(__A, __B), + (__v16qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B) +_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_min_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) +_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) __W, - (__mmask32) __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_min_epi8(__A, __B), + (__v32qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B) +_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_min_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_min_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B) +_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) { - return 
(__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) _mm256_setzero_si256 (), - (__mmask16) __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_min_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) +_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_min_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B) +_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_min_epu8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) +_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_min_epu8(__A, __B), + (__v16qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_min_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) +_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) __W, - (__mmask32) __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_min_epu8(__A, __B), + (__v32qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B) +_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_min_epu16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) +_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_min_epu16(__A, __B), + (__v8hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B) +_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) _mm256_setzero_si256 (), 
- (__mmask16) __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_min_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) +_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_min_epu16(__A, __B), + (__v16hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) +_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_shuffle_epi8(__A, __B), + (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_shuffle_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) +_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) __W, - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_shuffle_epi8(__A, __B), + (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_shuffle_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) +_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_subs_epi8(__A, __B), + (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_subs_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_subs_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) +_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A, - 
(__v32qi) __B, - (__v32qi) __W, - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_subs_epi8(__A, __B), + (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_subs_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_subs_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_subs_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) +_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_subs_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_subs_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) +_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) __W, - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_subs_epu8(__A, __B), + (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B) +_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A, - (__v16qi) __B, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_subs_epu8(__A, __B), + (__v16qi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) +_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A, - (__v32qi) __B, - 
(__v32qi) __W, - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_subs_epu8(__A, __B), + (__v32qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B) +_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A, - (__v32qi) __B, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __U); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_subs_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_subs_epu16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_subs_epu16 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_subs_epu16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) -{ - return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); +_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_subs_epu16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B) +_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_subs_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -1828,69 +1664,60 @@ _mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X, - (__v16qi) __Y, - (__v8hi) __W, - (__mmask8) __U); +_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_maddubs_epi16(__X, __Y), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X, - (__v16qi) __Y, - (__v8hi) _mm_setzero_si128(), - (__mmask8) __U); +_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_maddubs_epi16(__X, __Y), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, - __m256i __Y) { - return (__m256i) 
__builtin_ia32_pmaddubsw256_mask ((__v32qi) __X, - (__v32qi) __Y, - (__v16hi) __W, - (__mmask16) __U); +_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, + __m256i __Y) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_maddubs_epi16(__X, __Y), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X, - (__v32qi) __Y, - (__v16hi) _mm256_setzero_si256(), - (__mmask16) __U); +_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_maddubs_epi16(__X, __Y), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v4si) __W, - (__mmask8) __U); +_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_madd_epi16(__A, __B), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v4si) _mm_setzero_si128(), - (__mmask8) __U); +_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_madd_epi16(__A, __B), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v8si) __W, - (__mmask8) __U); +_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_madd_epi16(__A, __B), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v8si) _mm256_setzero_si256(), - (__mmask8) __U); +_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_madd_epi16(__A, __B), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -2056,104 +1883,89 @@ _mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M); } + static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X, - (__v8hi) __Y, - (__v8hi) __W, - (__mmask8) __U); +_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhrs_epi16(__X, __Y), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X, - (__v8hi) __Y, - (__v8hi) _mm_setzero_si128(), - (__mmask8) __U); +_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { + return 
(__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhrs_epi16(__X, __Y), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X, - (__v16hi) __Y, - (__v16hi) __W, - (__mmask16) __U); +_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhrs_epi16(__X, __Y), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X, - (__v16hi) __Y, - (__v16hi) _mm256_setzero_si256(), - (__mmask16) __U); +_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhrs_epi16(__X, __Y), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); +_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhi_epu16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) _mm_setzero_si128(), - (__mmask8) __U); +_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhi_epu16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); +_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhi_epu16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) _mm256_setzero_si256(), - (__mmask16) __U); +_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhi_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); +_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhi_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) _mm_setzero_si128(), - (__mmask8) __U); +_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, 
__m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhi_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); +_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhi_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) _mm256_setzero_si256(), - (__mmask16) __U); +_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhi_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -2269,72 +2081,68 @@ _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) +_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtepi8_epi16(__A), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi8_epi16 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtepi8_epi16(__A), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) +_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtepi8_epi16(__A), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepi8_epi16 (__mmask16 __U, __m128i __A) +_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtepi8_epi16(__A), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) +_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtepu8_epi16(__A), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepu8_epi16 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + 
(__v8hi)_mm_cvtepu8_epi16(__A), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) +_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtepu8_epi16(__A), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtepu8_epi16(__A), + (__v16hi)_mm256_setzero_si256()); } @@ -2461,366 +2269,328 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) (__v16hi)_mm256_setzero_si256()); }) static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_sllv_epi16 (__m256i __A, __m256i __B) +_mm256_sllv_epi16(__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) -1); + return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sllv_epi16 (__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) +_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sllv_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sllv_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sllv_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sllv_epi16 (__m128i __A, __m128i __B) +_mm_sllv_epi16(__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_hi (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sllv_epi16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sllv_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sllv_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sllv_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sll_epi16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) 
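Besides the mechanical select rewrite, the cvtepi8_epi16/cvtepu8_epi16 signatures above also correct the mask type (the 128-bit and 256-bit forms previously declared __mmask32 where only 8 or 16 lanes exist), and the unmasked variable-shift intrinsics such as _mm256_sllv_epi16 now map to a plain builtin with no mask operand. A short usage sketch of the variable word shifts, again assuming AVX512BW and AVX512VL support (lane values are illustrative):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m256i ones = _mm256_set1_epi16(1);
  __m256i counts = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7,
                                     8, 9, 10, 11, 12, 13, 14, 15);
  __m256i shifted = _mm256_sllv_epi16(ones, counts);          /* lane i holds 1 << i */
  __mmask16 k = 0x00FF;
  __m256i masked = _mm256_maskz_sllv_epi16(k, ones, counts);  /* high 8 lanes zeroed */
  short out[16];
  _mm256_storeu_si256((__m256i *)out, shifted);
  printf("sllv:  lane3=%d lane10=%d\n", out[3], out[10]);
  _mm256_storeu_si256((__m256i *)out, masked);
  printf("maskz: lane3=%d lane10=%d\n", out[3], out[10]);
  return 0;
}

With merge masking (_mm256_mask_sllv_epi16) the unselected lanes would instead take their values from the first argument.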
+_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sll_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sll_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sll_epi16 (__m256i __W, __mmask16 __U, __m256i __A, - __m128i __B) +_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A, - (__v8hi) __B, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sll_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sll_epi16 (__mmask16 __U, __m256i __A, __m128i __B) +_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A, - (__v8hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sll_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } -#define _mm_mask_slli_epi16(W, U, A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)(__m128i)(A), (int)(B), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U)); }) - -#define _mm_maskz_slli_epi16(U, A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)(__m128i)(A), (int)(B), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U)); }) - -#define _mm256_mask_slli_epi16(W, U, A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)(__m256i)(A), (int)(B), \ - (__v16hi)(__m256i)(W), \ - (__mmask16)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_slli_epi16(__A, __B), + (__v8hi)__W); +} -#define _mm256_maskz_slli_epi16(U, A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)(__m256i)(A), (int)(B), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_slli_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_slli_epi16(__A, __B), + (__v16hi)__W); +} +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_slli_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_srlv_epi16 (__m256i __A, __m256i __B) +_mm256_srlv_epi16(__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - 
(__mmask16) -1); + return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_srlv_epi16 (__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) +_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srlv_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_srlv_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srlv_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_srlv_epi16 (__m128i __A, __m128i __B) +_mm_srlv_epi16(__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_hi (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_srlv_epi16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srlv_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_srlv_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srlv_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_srav_epi16 (__m256i __A, __m256i __B) +_mm256_srav_epi16(__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) -1); + return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_srav_epi16 (__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) +_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srav_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_srav_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A, - (__v16hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srav_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_srav_epi16 (__m128i __A, __m128i __B) 
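The masked immediate-shift forms in this hunk (_mm_mask_slli_epi16, _mm_mask_srai_epi16, _mm_mask_srli_epi16 and their 256-bit and maskz counterparts) also change from macros into inline functions built on the unmasked intrinsic plus a select. A small calling sketch, assuming AVX512BW and AVX512VL; the shift count is an ordinary int here, which these clang definitions accept, though other toolchains may still require an immediate constant:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_set1_epi16(-64);
  __m128i w = _mm_set1_epi16(100);   /* merge source */
  __mmask8 k = 0x0F;                 /* low 4 word lanes */
  int n = 3;                         /* runtime shift count, evaluated once */
  __m128i sra = _mm_mask_srai_epi16(w, k, v, n);   /* lanes 0-3: -64 >> 3 = -8, lanes 4-7: 100 */
  __m128i srl = _mm_maskz_srli_epi16(k, v, n);     /* logical shift; lanes 4-7 zeroed */
  short out[8];
  _mm_storeu_si128((__m128i *)out, sra);
  printf("srai: lane0=%d lane7=%d\n", out[0], out[7]);
  _mm_storeu_si128((__m128i *)out, srl);
  printf("srli: lane0=%d lane7=%d\n", out[0], out[7]);
  return 0;
}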
+_mm_srav_epi16(__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_hi (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_srav_epi16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srav_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_srav_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srav_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sra_epi16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sra_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sra_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sra_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sra_epi16 (__m256i __W, __mmask16 __U, __m256i __A, - __m128i __B) +_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A, - (__v8hi) __B, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sra_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sra_epi16 (__mmask16 __U, __m256i __A, __m128i __B) +_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A, - (__v8hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sra_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } -#define _mm_mask_srai_epi16(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(imm), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srai_epi16(__A, __B), + (__v8hi)__W); +} -#define _mm_maskz_srai_epi16(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(imm), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_srai_epi16(__mmask8 __U, 
__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srai_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} -#define _mm256_mask_srai_epi16(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(imm), \ - (__v16hi)(__m256i)(W), \ - (__mmask16)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srai_epi16(__A, __B), + (__v16hi)__W); +} -#define _mm256_maskz_srai_epi16(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(imm), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srai_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_srl_epi16 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srl_epi16(__A, __B), + (__v8hi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A, - (__v8hi) __B, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srl_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_srl_epi16 (__m256i __W, __mmask16 __U, __m256i __A, - __m128i __B) +_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A, - (__v8hi) __B, - (__v16hi) __W, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srl_epi16(__A, __B), + (__v16hi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_srl_epi16 (__mmask16 __U, __m256i __A, __m128i __B) +_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A, - (__v8hi) __B, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srl_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } -#define _mm_mask_srli_epi16(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(imm), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srli_epi16(__A, __B), + (__v8hi)__W); +} -#define _mm_maskz_srli_epi16(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(imm), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srli_epi16(__A, __B), + 
(__v8hi)_mm_setzero_si128()); +} -#define _mm256_mask_srli_epi16(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(imm), \ - (__v16hi)(__m256i)(W), \ - (__mmask16)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srli_epi16(__A, __B), + (__v16hi)__W); +} -#define _mm256_maskz_srli_epi16(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(imm), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srli_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A) @@ -3342,28 +3112,24 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, } #define _mm_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \ - (__m128i)__builtin_ia32_palignr128_mask((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(N), \ - (__v16qi)(__m128i)(W), \ - (__mmask16)(U)); }) + (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ + (__v16qi)(__m128i)(W)); }) #define _mm_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \ - (__m128i)__builtin_ia32_palignr128_mask((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(N), \ - (__v16qi)_mm_setzero_si128(), \ - (__mmask16)(U)); }) + (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ + (__v16qi)_mm_setzero_si128()); }) #define _mm256_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \ - (__m256i)__builtin_ia32_palignr256_mask((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), (int)(N), \ - (__v32qi)(__m256i)(W), \ - (__mmask32)(U)); }) + (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ + (__v32qi)(__m256i)(W)); }) #define _mm256_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \ - (__m256i)__builtin_ia32_palignr256_mask((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), (int)(N), \ - (__v32qi)_mm256_setzero_si256(), \ - (__mmask32)(U)); }) + (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ + (__v32qi)_mm256_setzero_si256()); }) #define _mm_dbsad_epu8(A, B, imm) __extension__ ({ \ (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \ diff --git a/lib/Headers/avx512vldqintrin.h b/lib/Headers/avx512vldqintrin.h index 8187bcd6b28e..cd9da4370564 100644 --- a/lib/Headers/avx512vldqintrin.h +++ b/lib/Headers/avx512vldqintrin.h @@ -37,20 +37,17 @@ _mm256_mullo_epi64 (__m256i __A, __m256i __B) { } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); +_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_mullo_epi64(__A, __B), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmullq256_mask 
((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); +_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_mullo_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -59,293 +56,241 @@ _mm_mullo_epi64 (__m128i __A, __m128i __B) { } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); +_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_mullo_epi64(__A, __B), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); +_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_mullo_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); +_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_andnot_pd(__A, __B), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); +_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_andnot_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); +_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_andnot_pd(__A, __B), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); +_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_andnot_pd(__A, __B), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); +_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_andnot_ps(__A, __B), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return 
(__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_andnot_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); +_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_andnot_ps(__A, __B), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_andnot_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); +_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_and_pd(__A, __B), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); +_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_and_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); +_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_and_pd(__A, __B), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); +_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_and_pd(__A, __B), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); +_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_and_ps(__A, __B), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, 
__m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_and_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); +_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_and_ps(__A, __B), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_and_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); +_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_xor_pd(__A, __B), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); +_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_xor_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); +_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_xor_pd(__A, __B), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_xor_pd(__A, __B), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); +_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_xor_ps(__A, __B), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_xor_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { 
- return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); +_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_xor_ps(__A, __B), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_xor_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); +_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_or_pd(__A, __B), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); +_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_or_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); +_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_or_pd(__A, __B), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); +_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_or_pd(__A, __B), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); +_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_or_ps(__A, __B), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_or_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); +_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return 
(__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_or_ps(__A, __B), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_or_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -1151,82 +1096,72 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) } #define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1); }) + (__m128d)__builtin_shufflevector((__v4df)(__m256d)(A), \ + (__v4df)_mm256_undefined_pd(), \ + ((imm) & 1) ? 2 : 0, \ + ((imm) & 1) ? 3 : 1); }) #define _mm256_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)); }) + (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm256_extractf64x2_pd((A), (imm)), \ + (__v2df)(W)); }) #define _mm256_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)); }) + (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm256_extractf64x2_pd((A), (imm)), \ + (__v2df)_mm_setzero_pd()); }) #define _mm256_extracti64x2_epi64(A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ - (int)(imm), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)-1); }) + (__m128i)__builtin_shufflevector((__v4di)(__m256i)(A), \ + (__v4di)_mm256_undefined_si256(), \ + ((imm) & 1) ? 2 : 0, \ + ((imm) & 1) ? 3 : 1); }) #define _mm256_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ - (int)(imm), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \ + (__v2di)(W)); }) #define _mm256_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ - (int)(imm), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \ + (__v2di)_mm_setzero_di()); }) #define _mm256_insertf64x2(A, B, imm) __extension__ ({ \ - (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1); }) + (__m256d)__builtin_shufflevector((__v4df)(A), \ + (__v4df)_mm256_castpd128_pd256((__m128d)(B)), \ + ((imm) & 0x1) ? 0 : 4, \ + ((imm) & 0x1) ? 1 : 5, \ + ((imm) & 0x1) ? 4 : 2, \ + ((imm) & 0x1) ? 
5 : 3); }) #define _mm256_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \ - (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(imm), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)); }) + (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ + (__v4df)(W)); }) #define _mm256_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \ - (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)); }) + (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ + (__v4df)_mm256_setzero_pd()); }) #define _mm256_inserti64x2(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \ - (__v2di)(__m128i)(B), \ - (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)-1); }) + (__m256i)__builtin_shufflevector((__v4di)(A), \ + (__v4di)_mm256_castsi128_si256((__m128i)(B)), \ + ((imm) & 0x1) ? 0 : 4, \ + ((imm) & 0x1) ? 1 : 5, \ + ((imm) & 0x1) ? 4 : 2, \ + ((imm) & 0x1) ? 5 : 3); }) #define _mm256_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \ - (__v2di)(__m128i)(B), \ - (int)(imm), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ + (__v4di)(W)); }) #define _mm256_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \ - (__v2di)(__m128i)(B), \ - (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256()); }) #define _mm_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \ (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ diff --git a/lib/Headers/avx512vlintrin.h b/lib/Headers/avx512vlintrin.h index 295ce291f7ce..f3744da6ab8a 100644 --- a/lib/Headers/avx512vlintrin.h +++ b/lib/Headers/avx512vlintrin.h @@ -616,277 +616,227 @@ _mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_add_epi32(__A, __B), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_add_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + return 
(__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_add_epi64(__A, __B), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_add_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sub_epi32(__A, __B), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sub_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +_mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sub_epi64(__A, __B), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +_mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sub_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_add_epi32(__A, __B), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_add_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + 
(__v2di)_mm_add_epi64(__A, __B), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_add_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sub_epi32(__A, __B), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sub_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_sub_epi64(__A, __B), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_sub_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) +_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, - (__v8si) __Y, - (__v4di) __W, __M); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_mul_epi32(__X, __Y), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, - (__v8si) __Y, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_mul_epi32(__X, __Y), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, - __m128i __Y) +_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, - (__v4si) __Y, - (__v2di) __W, __M); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_mul_epi32(__X, __Y), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, 
__m128i __Y) +_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, - (__v4si) __Y, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_mul_epi32(__X, __Y), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) +_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, - (__v8si) __Y, - (__v4di) __W, __M); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_mul_epu32(__X, __Y), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, - (__v8si) __Y, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_mul_epu32(__X, __Y), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, - __m128i __Y) +_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, - (__v4si) __Y, - (__v2di) __W, __M); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_mul_epu32(__X, __Y), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) +_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, - (__v4si) __Y, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_mul_epu32(__X, __Y), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +_mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_mullo_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) +_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_mullo_epi32(__A, __B), + (__v8si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +_mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_mullo_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_mullo_epi32 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) +_mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) 
__builtin_ia32_pmulld128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, __M); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_mullo_epi32(__A, __B), + (__v4si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS @@ -1895,71 +1845,59 @@ _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); +_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_add_pd(__A, __B), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); +_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_add_pd(__A, __B), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); +_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_add_pd(__A, __B), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); +_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_add_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); +_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_add_ps(__A, __B), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_add_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); +_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_add_ps(__A, __B), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +_mm256_maskz_add_ps(__mmask8 __U, __m256 
__A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_add_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -2196,32 +2134,30 @@ _mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) { static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, + (__v2df)_mm_cvtepi32_pd(__A), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) { - return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, + (__v2df)_mm_cvtepi32_pd(__A), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) { - return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, + (__v4df)_mm256_cvtepi32_pd(__A), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) { - return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, + (__v4df)_mm256_cvtepi32_pd(__A), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -2620,48 +2556,41 @@ _mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) { static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepu32_pd (__m128i __A) { - return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); + return (__m128d) __builtin_convertvector( + __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, + (__v2df)_mm_cvtepu32_pd(__A), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) { - return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, + (__v2df)_mm_cvtepu32_pd(__A), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_cvtepu32_pd (__m128i __A) { - return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) -1); + return (__m256d)__builtin_convertvector((__v4su)__A, __v4df); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) { - return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, + (__v4df)_mm256_cvtepu32_pd(__A), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) { - return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return 
(__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, + (__v4df)_mm256_cvtepu32_pd(__A), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -2711,72 +2640,59 @@ _mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) { } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); +_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_div_pd(__A, __B), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); +_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_div_pd(__A, __B), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); +_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_div_pd(__A, __B), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); +_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_div_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); +_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_div_ps(__A, __B), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_div_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); +_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_div_ps(__A, __B), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_div_ps(__A, 
__B), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128d __DEFAULT_FN_ATTRS @@ -3127,240 +3043,199 @@ _mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) { } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); +_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_max_pd(__A, __B), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); +_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_max_pd(__A, __B), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); +_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_max_pd(__A, __B), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); +_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_max_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); +_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_max_ps(__A, __B), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_max_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); +_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_max_ps(__A, __B), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_max_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128d 
__DEFAULT_FN_ATTRS -_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); +_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_min_pd(__A, __B), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); +_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_min_pd(__A, __B), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); +_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_min_pd(__A, __B), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); +_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_min_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); +_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_min_ps(__A, __B), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_min_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); +_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_min_ps(__A, __B), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_min_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); 
+_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_mul_pd(__A, __B), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); +_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_mul_pd(__A, __B), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); +_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_mul_pd(__A, __B), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); +_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_mul_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); +_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_mul_ps(__A, __B), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_mul_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); +_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_mul_ps(__A, __B), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_mul_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A, - (__v4si) __W, - (__mmask8) __U); +_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_abs_epi32(__A), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS 
-_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); +_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_abs_epi32(__A), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A, - (__v8si) __W, - (__mmask8) __U); +_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectd_256((__mmask16)__U, + (__v8si)_mm256_abs_epi32(__A), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); +_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectd_256((__mmask16)__U, + (__v8si)_mm256_abs_epi32(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -3410,37 +3285,31 @@ _mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) { } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - __M); +_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_max_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, __M); +_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_max_epi32(__A, __B), + (__v4si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - __M); +_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_max_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, __M); +_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_max_epi32(__A, __B), + (__v8si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -3496,37 +3365,31 @@ _mm256_max_epi64 (__m256i __A, __m256i __B) { } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - __M); +_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_max_epu32(__A, __B), + (__v4si)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS 
-_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, __M); +_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_max_epu32(__A, __B), + (__v4si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - __M); +_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_max_epu32(__A, __B), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, __M); +_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_max_epu32(__A, __B), + (__v8si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -3582,37 +3445,31 @@ _mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - __M); +_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_min_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, __M); +_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_min_epi32(__A, __B), + (__v4si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - __M); +_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_min_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, __M); +_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_min_epi32(__A, __B), + (__v8si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -3668,37 +3525,31 @@ _mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) { } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - __M); +_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + 
(__v4si)_mm_min_epu32(__A, __B), + (__v4si)_mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, __M); +_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_min_epu32(__A, __B), + (__v4si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - __M); +_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_min_epu32(__A, __B), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, __M); +_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_min_epu32(__A, __B), + (__v8si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -4095,132 +3946,115 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)(__m256i)(v1), (int)(scale)); }) static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); +_mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sqrt_pd(__A), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); +_mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sqrt_pd(__A), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); +_mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sqrt_pd(__A), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); +_mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sqrt_pd(__A), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); +_mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sqrt_ps(__A), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) 
__A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +_mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sqrt_ps(__A), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); +_mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sqrt_ps(__A), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +_mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sqrt_ps(__A), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); +_mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sub_pd(__A, __B), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); +_mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sub_pd(__A, __B), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); +_mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sub_pd(__A, __B), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); +_mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sub_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_sub_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); +_mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sub_ps(__A, __B), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_sub_ps (__mmask16 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +_mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sub_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_sub_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); +_mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sub_ps(__A, __B), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_sub_ps (__mmask16 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +_mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sub_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -4551,344 +4385,324 @@ _mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A, } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepi8_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi8_epi32(__A), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi8_epi32(__A), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi8_epi32(__A), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi8_epi32(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepi8_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi8_epi64(__A), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi8_epi64(__A), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepi8_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +_mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + 
(__v4di)_mm256_cvtepi8_epi64(__A), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +_mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi8_epi64(__A), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepi32_epi64 (__m128i __W, __mmask8 __U, __m128i __X) +_mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) { - return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi32_epi64(__X), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X) +_mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { - return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi32_epi64(__X), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepi32_epi64 (__m256i __W, __mmask8 __U, __m128i __X) +_mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) { - return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi32_epi64(__X), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X) +_mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { - return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi32_epi64(__X), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepi16_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi16_epi32(__A), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi16_epi32(__A), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepi16_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +_mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi16_epi32(__A), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return 
(__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi16_epi32(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepi16_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi16_epi64(__A), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi16_epi64(__A), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepi16_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +_mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi16_epi64(__A), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) +_mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi16_epi64(__A), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepu8_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu8_epi32(__A), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu8_epi32(__A), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepu8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu8_epi32(__A), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A) +_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu8_epi32(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepu8_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) 
__builtin_ia32_pmovzxbq128_mask ((__v16qi) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu8_epi64(__A), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu8_epi64(__A), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepu8_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu8_epi64(__A), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu8_epi64(__A), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepu32_epi64 (__m128i __W, __mmask8 __U, __m128i __X) +_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) { - return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu32_epi64(__X), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X) +_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { - return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu32_epi64(__X), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepu32_epi64 (__m256i __W, __mmask8 __U, __m128i __X) +_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) { - return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu32_epi64(__X), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X) +_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { - return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu32_epi64(__X), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepu16_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu16_epi32(__A), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { - return (__m128i) 
__builtin_ia32_pmovzxwd128_mask ((__v8hi) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu16_epi32(__A), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepu16_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu16_epi32(__A), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A) +_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu16_epi32(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepu16_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu16_epi64(__A), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu16_epi64(__A), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepu16_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu16_epi64(__A), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { - return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu16_epi64(__A), + (__v4di)_mm256_setzero_si256()); } @@ -5125,125 +4939,132 @@ _mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) (__mmask8)(U)); }) static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sll_epi32(__A, __B), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return 
(__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sll_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) +_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, - (__v4si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sll_epi32(__A, __B), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, - (__v4si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sll_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); } -#define _mm_mask_slli_epi32(W, U, A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(A), (int)(B), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_slli_epi32(__A, __B), + (__v4si)__W); +} -#define _mm_maskz_slli_epi32(U, A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(A), (int)(B), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_slli_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} -#define _mm256_mask_slli_epi32(W, U, A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(A), (int)(B), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_slli_epi32(__A, __B), + (__v8si)__W); +} -#define _mm256_maskz_slli_epi32(U, A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(A), (int)(B), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_slli_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_sll_epi64(__A, __B), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_di (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_sll_epi64(__A, __B), + (__v2di)_mm_setzero_di()); } 
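/* An illustrative sketch, not part of the upstream change: the hunks above
 * rewrite each masked intrinsic as __builtin_ia32_select* applied to the
 * unmasked intrinsic's result, i.e. merge masking (_mm_mask_*) keeps the
 * passthrough lane where the mask bit is clear, and zero masking
 * (_mm_maskz_*) writes zero there.  The standalone check below compares
 * _mm_mask_sll_epi32 against a scalar per-lane select; the input values and
 * main() are invented for the example, and it assumes a build with something
 * like -mavx512f -mavx512vl running on AVX-512VL hardware. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  __m128i a   = _mm_set_epi32(8, 4, 2, 1);
  __m128i w   = _mm_set1_epi32(-1);           /* passthrough lanes             */
  __m128i cnt = _mm_set_epi32(0, 0, 0, 3);    /* shift count = 3 (low 64 bits) */
  __mmask8 u  = 0x5;                          /* keep lanes 0 and 2            */

  __m128i masked = _mm_mask_sll_epi32(w, u, a, cnt);
  __m128i full   = _mm_sll_epi32(a, cnt);     /* the unmasked operation        */

  int res[4], fv[4], wv[4];
  _mm_storeu_si128((__m128i *)res, masked);
  _mm_storeu_si128((__m128i *)fv,  full);
  _mm_storeu_si128((__m128i *)wv,  w);

  for (int i = 0; i < 4; ++i) {
    int ref = (u & (1 << i)) ? fv[i] : wv[i];  /* per-lane select semantics    */
    printf("lane %d: got %d, reference %d\n", i, res[i], ref);
  }
  return 0;
}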
static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) +_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, - (__v2di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sll_epi64(__A, __B), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, - (__v2di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sll_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); } -#define _mm_mask_slli_epi64(W, U, A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(A), (int)(B), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U)); }) - -#define _mm_maskz_slli_epi64(U, A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(A), (int)(B), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_slli_epi64(__A, __B), + (__v2di)__W); +} -#define _mm256_mask_slli_epi64(W, U, A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(A), (int)(B), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_slli_epi64(__A, __B), + (__v2di)_mm_setzero_di()); +} -#define _mm256_maskz_slli_epi64(U, A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(A), (int)(B), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_slli_epi64(__A, __B), + (__v4di)__W); +} +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_slli_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_rorv_epi32 (__m128i __A, __m128i __B) @@ -5366,387 +5187,335 @@ _mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sllv_epi64 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_sllv_epi64(__X, __Y), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sllv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) - _mm_setzero_di (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + 
(__v2di)_mm_sllv_epi64(__X, __Y), + (__v2di)_mm_setzero_di()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sllv_epi64 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sllv_epi64(__X, __Y), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sllv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sllv_epi64(__X, __Y), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sllv_epi32 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sllv_epi32(__X, __Y), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sllv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sllv_epi32(__X, __Y), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sllv_epi32 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sllv_epi32(__X, __Y), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sllv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sllv_epi32(__X, __Y), + (__v8si)_mm256_setzero_si256()); } - - static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_srlv_epi64 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srlv_epi64(__X, __Y), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_srlv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) - _mm_setzero_di (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srlv_epi64(__X, __Y), + (__v2di)_mm_setzero_di()); } 
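/* A companion sketch, also not part of the upstream change: the zero-masking
 * form is the same select with a zero passthrough, so maskz(op) produces the
 * same lanes as mask(zero, op).  Checked here with the variable-shift
 * intrinsics converted in the surrounding hunks; the inputs and main() are
 * invented for the example, under the same -mavx512f -mavx512vl assumptions. */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
  __m128i x = _mm_set_epi64x(100, 7);          /* two 64-bit lanes             */
  __m128i y = _mm_set_epi64x(2, 1);            /* per-lane shift counts        */
  __mmask8 u = 0x2;                            /* keep only the upper lane     */

  __m128i zeroed = _mm_maskz_sllv_epi64(u, x, y);
  __m128i merged = _mm_mask_sllv_epi64(_mm_setzero_si128(), u, x, y);

  long long a[2], b[2];
  _mm_storeu_si128((__m128i *)a, zeroed);
  _mm_storeu_si128((__m128i *)b, merged);
  printf("maskz == mask with zero passthrough: %s\n",
         memcmp(a, b, sizeof a) == 0 ? "yes" : "no");
  printf("lanes: %lld %lld\n", a[0], a[1]);    /* expect 0 and 400             */
  return 0;
}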
static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_srlv_epi64 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srlv_epi64(__X, __Y), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_srlv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srlv_epi64(__X, __Y), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_srlv_epi32 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srlv_epi32(__X, __Y), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_srlv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srlv_epi32(__X, __Y), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_srlv_epi32 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srlv_epi32(__X, __Y), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_srlv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srlv_epi32(__X, __Y), + (__v8si)_mm256_setzero_si256()); } - - static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_srl_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srl_epi32(__A, __B), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_srl_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srl_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS 
-_mm256_mask_srl_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) +_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A, - (__v4si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srl_epi32(__A, __B), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_srl_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A, - (__v4si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srl_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); } -#define _mm_mask_srli_epi32(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srli_epi32(__A, __B), + (__v4si)__W); +} -#define _mm_maskz_srli_epi32(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srli_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} -#define _mm256_mask_srli_epi32(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(imm), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srli_epi32(__A, __B), + (__v8si)__W); +} -#define _mm256_maskz_srli_epi32(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srli_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_srl_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srl_epi64(__A, __B), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_srl_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_di (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srl_epi64(__A, __B), + (__v2di)_mm_setzero_di()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_srl_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) 
+_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A, - (__v2di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srl_epi64(__A, __B), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_srl_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A, - (__v2di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srl_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); } -#define _mm_mask_srli_epi64(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(imm), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srli_epi64(__A, __B), + (__v2di)__W); +} -#define _mm_maskz_srli_epi64(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(imm), \ - (__v2di)_mm_setzero_si128(), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srli_epi64(__A, __B), + (__v2di)_mm_setzero_di()); +} -#define _mm256_mask_srli_epi64(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(imm), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srli_epi64(__A, __B), + (__v4di)__W); +} -#define _mm256_maskz_srli_epi64(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srli_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_srav_epi32 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srav_epi32(__X, __Y), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_srav_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X, - (__v4si) __Y, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srav_epi32(__X, __Y), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_srav_epi32 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { - return 
(__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srav_epi32(__X, __Y), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_srav_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X, - (__v8si) __Y, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srav_epi32(__X, __Y), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_srav_epi64 (__m128i __X, __m128i __Y) +_mm_srav_epi64(__m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) - _mm_setzero_di (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_srav_epi64 (__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) +_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srav_epi64(__X, __Y), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_srav_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) - _mm_setzero_di (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srav_epi64(__X, __Y), + (__v2di)_mm_setzero_di()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_srav_epi64 (__m256i __X, __m256i __Y) +_mm256_srav_epi64(__m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di) __Y); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_srav_epi64 (__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) +_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srav_epi64(__X, __Y), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srav_epi64(__X, __Y), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -5975,6 +5744,7 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) (__v8si)_mm256_setzero_si256(), \ (__mmask8)(M)); }) +#ifdef __x86_64__ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { @@ -6006,6 +5776,7 @@ _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) _mm256_setzero_si256 (), __M); } +#endif #define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \ 
(__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ @@ -6653,85 +6424,67 @@ _mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) (__v8sf)_mm256_setzero_ps()); }) static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A, - __m128i __C) +_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) { - return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, - (__v2di) __C, - (__v2df) __W, - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_permutevar_pd(__A, __C), + (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C) +_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) { - return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, - (__v2di) __C, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_permutevar_pd(__A, __C), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256i __C) +_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C) { - return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, - (__v4di) __C, - (__v4df) __W, - (__mmask8) - __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_permutevar_pd(__A, __C), + (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C) +_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) { - return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, - (__v4di) __C, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) - __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_permutevar_pd(__A, __C), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A, - __m128i __C) +_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) { - return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, - (__v4si) __C, - (__v4sf) __W, - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_permutevar_ps(__A, __C), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C) +_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) { - return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, - (__v4si) __C, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_permutevar_ps(__A, __C), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A, - __m256i __C) +_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C) { - return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, - (__v8si) __C, - (__v8sf) __W, - (__mmask8) __U); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_permutevar_ps(__A, __C), + (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C) +_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) { - return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, - (__v8si) __C, 
- (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_permutevar_ps(__A, __C), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS @@ -6985,154 +6738,156 @@ _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sra_epi32(__A, __B), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sra_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) +_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, - (__v4si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sra_epi32(__A, __B), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, - (__v4si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sra_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); } -#define _mm_mask_srai_epi32(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srai_epi32(__A, __B), + (__v4si)__W); +} -#define _mm_maskz_srai_epi32(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srai_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} -#define _mm256_mask_srai_epi32(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(imm), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srai_epi32(__A, __B), + (__v8si)__W); +} -#define _mm256_maskz_srai_epi32(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) +static 
__inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srai_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sra_epi64 (__m128i __A, __m128i __B) +_mm_sra_epi64(__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_di (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ + (__v2di)_mm_sra_epi64(__A, __B), \ + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_di (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ + (__v2di)_mm_sra_epi64(__A, __B), \ + (__v2di)_mm_setzero_di()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_sra_epi64 (__m256i __A, __m128i __B) +_mm256_sra_epi64(__m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, - (__v2di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) +_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, - (__v2di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm256_sra_epi64(__A, __B), \ + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, - (__v2di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm256_sra_epi64(__A, __B), \ + (__v4di)_mm256_setzero_si256()); } -#define _mm_srai_epi64(A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)-1); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_srai_epi64(__m128i __A, int __imm) +{ + return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm); +} -#define _mm_mask_srai_epi64(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ + (__v2di)_mm_srai_epi64(__A, __imm), \ + (__v2di)__W); +} -#define _mm_maskz_srai_epi64(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), 
\ - (__v2di)_mm_setzero_si128(), \ - (__mmask8)(U)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, int __imm) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ + (__v2di)_mm_srai_epi64(__A, __imm), \ + (__v2di)_mm_setzero_di()); +} -#define _mm256_srai_epi64(A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)-1); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_srai_epi64(__m256i __A, int __imm) +{ + return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm); +} -#define _mm256_mask_srai_epi64(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm256_srai_epi64(__A, __imm), \ + (__v4di)__W); +} -#define _mm256_maskz_srai_epi64(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm256_srai_epi64(__A, __imm), \ + (__v4di)_mm256_setzero_si256()); +} #define _mm_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \ (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ @@ -8473,79 +8228,84 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) } #define _mm256_extractf32x4_ps(A, imm) __extension__ ({ \ - (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1); }) + (__m128)__builtin_shufflevector((__v8sf)(__m256)(A), \ + (__v8sf)_mm256_undefined_ps(), \ + ((imm) & 1) ? 4 : 0, \ + ((imm) & 1) ? 5 : 1, \ + ((imm) & 1) ? 6 : 2, \ + ((imm) & 1) ? 7 : 3); }) #define _mm256_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({ \ - (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)); }) + (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \ + (__v4sf)(W)); }) #define _mm256_maskz_extractf32x4_ps(U, A, imm) __extension__ ({ \ - (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)); }) + (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \ + (__v4sf)_mm_setzero_ps()); }) #define _mm256_extracti32x4_epi32(A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)-1); }) + (__m128i)__builtin_shufflevector((__v8si)(__m256)(A), \ + (__v8si)_mm256_undefined_si256(), \ + ((imm) & 1) ? 4 : 0, \ + ((imm) & 1) ? 5 : 1, \ + ((imm) & 1) ? 6 : 2, \ + ((imm) & 1) ? 
7 : 3); }) #define _mm256_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \ + (__v4si)(W)); }) #define _mm256_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \ + (__v4si)_mm_setzero_si128()); }) #define _mm256_insertf32x4(A, B, imm) __extension__ ({ \ - (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \ - (__v4sf)(__m128)(B), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1); }) + (__m256)__builtin_shufflevector((__v8sf)(A), \ + (__v8sf)_mm256_castps128_ps256((__m128)(B)), \ + ((imm) & 0x1) ? 0 : 8, \ + ((imm) & 0x1) ? 1 : 9, \ + ((imm) & 0x1) ? 2 : 10, \ + ((imm) & 0x1) ? 3 : 11, \ + ((imm) & 0x1) ? 8 : 4, \ + ((imm) & 0x1) ? 9 : 5, \ + ((imm) & 0x1) ? 10 : 6, \ + ((imm) & 0x1) ? 11 : 7); }) #define _mm256_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \ - (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \ - (__v4sf)(__m128)(B), (int)(imm), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)); }) + (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \ + (__v8sf)(W)); }) #define _mm256_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \ - (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \ - (__v4sf)(__m128)(B), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)); }) + (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \ + (__v8sf)_mm256_setzero_ps()); }) #define _mm256_inserti32x4(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \ - (__v4si)(__m128i)(B), \ - (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1); }) + (__m256i)__builtin_shufflevector((__v8si)(A), \ + (__v8si)_mm256_castsi128_si256((__m128i)(B)), \ + ((imm) & 0x1) ? 0 : 8, \ + ((imm) & 0x1) ? 1 : 9, \ + ((imm) & 0x1) ? 2 : 10, \ + ((imm) & 0x1) ? 3 : 11, \ + ((imm) & 0x1) ? 8 : 4, \ + ((imm) & 0x1) ? 9 : 5, \ + ((imm) & 0x1) ? 10 : 6, \ + ((imm) & 0x1) ? 
11 : 7); }) #define _mm256_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \ - (__v4si)(__m128i)(B), \ - (int)(imm), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_inserti32x4((A), (B), (imm)), \ + (__v8si)(W)); }) #define _mm256_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \ - (__v4si)(__m128i)(B), \ - (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_inserti32x4((A), (B), (imm)), \ + (__v8si)_mm256_setzero_si256()); }) #define _mm_getmant_pd(A, B, C) __extension__({\ (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ @@ -8860,76 +8620,78 @@ _mm256_permutexvar_epi32 (__m256i __X, __m256i __Y) } #define _mm_alignr_epi32(A, B, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ - (__mmask8)-1); }) + (__m128i)__builtin_shufflevector((__v4si)(__m128i)(B), \ + (__v4si)(__m128i)(A), \ + ((int)(imm) & 0x3) + 0, \ + ((int)(imm) & 0x3) + 1, \ + ((int)(imm) & 0x3) + 2, \ + ((int)(imm) & 0x3) + 3); }) #define _mm_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ + (__v4si)(__m128i)(W)); }) #define _mm_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ + (__v4si)_mm_setzero_si128()); }) #define _mm256_alignr_epi32(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(imm), \ - (__v8si)_mm256_undefined_si256(), \ - (__mmask8)-1); }) + (__m256i)__builtin_shufflevector((__v8si)(__m256i)(B), \ + (__v8si)(__m256i)(A), \ + ((int)(imm) & 0x7) + 0, \ + ((int)(imm) & 0x7) + 1, \ + ((int)(imm) & 0x7) + 2, \ + ((int)(imm) & 0x7) + 3, \ + ((int)(imm) & 0x7) + 4, \ + ((int)(imm) & 0x7) + 5, \ + ((int)(imm) & 0x7) + 6, \ + ((int)(imm) & 0x7) + 7); }) #define _mm256_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(imm), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ + (__v8si)(__m256i)(W)); }) #define _mm256_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ + (__v8si)_mm256_setzero_si256()); }) #define _mm_alignr_epi64(A, B, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(imm), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)-1); }) + 
(__m128i)__builtin_shufflevector((__v2di)(__m128i)(B), \ + (__v2di)(__m128i)(A), \ + ((int)(imm) & 0x1) + 0, \ + ((int)(imm) & 0x1) + 1); }) #define _mm_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(imm), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ + (__v2di)(__m128i)(W)); }) #define _mm_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(imm), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ + (__v2di)_mm_setzero_di()); }) #define _mm256_alignr_epi64(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(imm), \ - (__v4di)_mm256_undefined_pd(), \ - (__mmask8)-1); }) + (__m256i)__builtin_shufflevector((__v4di)(__m256i)(B), \ + (__v4di)(__m256i)(A), \ + ((int)(imm) & 0x3) + 0, \ + ((int)(imm) & 0x3) + 1, \ + ((int)(imm) & 0x3) + 2, \ + ((int)(imm) & 0x3) + 3); }) #define _mm256_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(imm), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ + (__v4di)(__m256i)(W)); }) #define _mm256_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256()); }) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) diff --git a/lib/Headers/avxintrin.h b/lib/Headers/avxintrin.h index 32e8546817b3..be03ba346031 100644 --- a/lib/Headers/avxintrin.h +++ b/lib/Headers/avxintrin.h @@ -57,7 +57,7 @@ typedef long long __m256i __attribute__((__vector_size__(32))); /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction. +/// This intrinsic corresponds to the <c> VADDPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing one of the source operands. @@ -75,7 +75,7 @@ _mm256_add_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction. +/// This intrinsic corresponds to the <c> VADDPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing one of the source operands. @@ -93,7 +93,7 @@ _mm256_add_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction. +/// This intrinsic corresponds to the <c> VSUBPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing the minuend. @@ -111,7 +111,7 @@ _mm256_sub_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction. +/// This intrinsic corresponds to the <c> VSUBPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing the minuend. 
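[Editor's note, not part of the patch] The AVX-512VL hunks above replace the dedicated masked builtins (e.g. __builtin_ia32_psrad128_mask, __builtin_ia32_alignq128_mask) with the plain unmasked operation followed by a generic per-lane select (__builtin_ia32_selectd_128, __builtin_ia32_selectq_256, and friends). A minimal conceptual sketch of what that select does, written as ordinary C over 64-bit lanes (the helper name select_q is made up for illustration):

#include <stdint.h>

/* Sketch of the per-lane semantics behind the __builtin_ia32_selectq_*
 * pattern used in the hunks above: mask bit i chooses between the result
 * of the unmasked operation and a pass-through value for lane i. */
static void select_q(uint8_t mask, int nlanes,
                     const int64_t *op_result, const int64_t *passthrough,
                     int64_t *out)
{
    for (int i = 0; i < nlanes; ++i)
        out[i] = ((mask >> i) & 1) ? op_result[i] : passthrough[i];
}

Merge-masking (_mm*_mask_* forms) passes the __W operand as the pass-through; zero-masking (_mm*_maskz_* forms) is the same select with an all-zero pass-through, which is why those variants above hand _mm_setzero_*() to the select builtin.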
@@ -130,7 +130,7 @@ _mm256_sub_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction. +/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing the left source operand. @@ -149,7 +149,7 @@ _mm256_addsub_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction. +/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing the left source operand. @@ -167,7 +167,7 @@ _mm256_addsub_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction. +/// This intrinsic corresponds to the <c> VDIVPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing the dividend. @@ -185,7 +185,7 @@ _mm256_div_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction. +/// This intrinsic corresponds to the <c> VDIVPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing the dividend. @@ -204,7 +204,7 @@ _mm256_div_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction. +/// This intrinsic corresponds to the <c> VMAXPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing one of the operands. @@ -223,7 +223,7 @@ _mm256_max_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction. +/// This intrinsic corresponds to the <c> VMAXPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing one of the operands. @@ -242,7 +242,7 @@ _mm256_max_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMINPD / MINPD instruction. +/// This intrinsic corresponds to the <c> VMINPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing one of the operands. @@ -261,7 +261,7 @@ _mm256_min_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMINPS / MINPS instruction. +/// This intrinsic corresponds to the <c> VMINPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing one of the operands. @@ -279,7 +279,7 @@ _mm256_min_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMULPD / MULPD instruction. +/// This intrinsic corresponds to the <c> VMULPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing one of the operands. @@ -297,7 +297,7 @@ _mm256_mul_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMULPS / MULPS instruction. +/// This intrinsic corresponds to the <c> VMULPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing one of the operands. @@ -316,7 +316,7 @@ _mm256_mul_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction. +/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double]. 
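[Editor's note, not part of the patch] The avxintrin.h hunks in this region only reflow the Doxygen markup (\c FOO / BAR becomes <c> FOO </c>); the documented arithmetic intrinsics themselves are unchanged. As a quick reference for what those comments describe, a small usage sketch (the function name example_avx_arith is made up for illustration):

#include <immintrin.h>

/* Sketch: each intrinsic documented above maps onto the VEX-encoded
 * instruction named in its rewritten comment. */
__m256d example_avx_arith(__m256d a, __m256d b)
{
    __m256d sum  = _mm256_add_pd(a, b);    /* VADDPD */
    __m256d diff = _mm256_sub_pd(a, b);    /* VSUBPD */
    __m256d prod = _mm256_mul_pd(sum, b);  /* VMULPD */
    return _mm256_sqrt_pd(_mm256_max_pd(diff, prod)); /* VMAXPD + VSQRTPD */
}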
@@ -333,7 +333,7 @@ _mm256_sqrt_pd(__m256d __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction. +/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float]. @@ -350,7 +350,7 @@ _mm256_sqrt_ps(__m256 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction. +/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float]. @@ -367,7 +367,7 @@ _mm256_rsqrt_ps(__m256 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction. +/// This intrinsic corresponds to the <c> VRCPPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float]. @@ -389,24 +389,24 @@ _mm256_rcp_ps(__m256 __a) /// __m256d _mm256_round_pd(__m256d V, const int M); /// \endcode /// -/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction. +/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. /// /// \param V /// A 256-bit vector of [4 x double]. /// \param M -/// An integer value that specifies the rounding operation. -/// Bits [7:4] are reserved. -/// Bit [3] is a precision exception value: -/// 0: A normal PE exception is used. -/// 1: The PE field is not updated. -/// Bit [2] is the rounding control source: -/// 0: Use bits [1:0] of M. -/// 1: Use the current MXCSR setting. -/// Bits [1:0] contain the rounding control definition: -/// 00: Nearest. -/// 01: Downward (toward negative infinity). -/// 10: Upward (toward positive infinity). -/// 11: Truncated. +/// An integer value that specifies the rounding operation. \n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used. \n +/// 1: The PE field is not updated. \n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M. \n +/// 1: Use the current MXCSR setting. \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest. \n +/// 01: Downward (toward negative infinity). \n +/// 10: Upward (toward positive infinity). \n +/// 11: Truncated. /// \returns A 256-bit vector of [4 x double] containing the rounded values. #define _mm256_round_pd(V, M) __extension__ ({ \ (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); }) @@ -421,24 +421,24 @@ _mm256_rcp_ps(__m256 __a) /// __m256 _mm256_round_ps(__m256 V, const int M); /// \endcode /// -/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction. +/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. /// /// \param V /// A 256-bit vector of [8 x float]. /// \param M -/// An integer value that specifies the rounding operation. -/// Bits [7:4] are reserved. -/// Bit [3] is a precision exception value: -/// 0: A normal PE exception is used. -/// 1: The PE field is not updated. -/// Bit [2] is the rounding control source: -/// 0: Use bits [1:0] of M. -/// 1: Use the current MXCSR setting. -/// Bits [1:0] contain the rounding control definition: -/// 00: Nearest. -/// 01: Downward (toward negative infinity). -/// 10: Upward (toward positive infinity). -/// 11: Truncated. +/// An integer value that specifies the rounding operation. \n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used. \n +/// 1: The PE field is not updated. 
\n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M. \n +/// 1: Use the current MXCSR setting. \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest. \n +/// 01: Downward (toward negative infinity). \n +/// 10: Upward (toward positive infinity). \n +/// 11: Truncated. /// \returns A 256-bit vector of [8 x float] containing the rounded values. #define _mm256_round_ps(V, M) __extension__ ({ \ (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); }) @@ -453,7 +453,7 @@ _mm256_rcp_ps(__m256 __a) /// __m256d _mm256_ceil_pd(__m256d V); /// \endcode /// -/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction. +/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. /// /// \param V /// A 256-bit vector of [4 x double]. @@ -470,7 +470,7 @@ _mm256_rcp_ps(__m256 __a) /// __m256d _mm256_floor_pd(__m256d V); /// \endcode /// -/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction. +/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. /// /// \param V /// A 256-bit vector of [4 x double]. @@ -488,7 +488,7 @@ _mm256_rcp_ps(__m256 __a) /// __m256 _mm256_ceil_ps(__m256 V); /// \endcode /// -/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction. +/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. /// /// \param V /// A 256-bit vector of [8 x float]. @@ -505,7 +505,7 @@ _mm256_rcp_ps(__m256 __a) /// __m256 _mm256_floor_ps(__m256 V); /// \endcode /// -/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction. +/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. /// /// \param V /// A 256-bit vector of [8 x float]. @@ -517,7 +517,7 @@ _mm256_rcp_ps(__m256 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction. +/// This intrinsic corresponds to the <c> VANDPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing one of the source operands. @@ -535,7 +535,7 @@ _mm256_and_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction. +/// This intrinsic corresponds to the <c> VANDPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing one of the source operands. @@ -554,7 +554,7 @@ _mm256_and_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction. +/// This intrinsic corresponds to the <c> VANDNPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing the left source operand. The @@ -575,7 +575,7 @@ _mm256_andnot_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction. +/// This intrinsic corresponds to the <c> VANDNPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing the left source operand. The @@ -595,7 +595,7 @@ _mm256_andnot_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VORPD / ORPD instruction. +/// This intrinsic corresponds to the <c> VORPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing one of the source operands. @@ -613,7 +613,7 @@ _mm256_or_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VORPS / ORPS instruction. 
+/// This intrinsic corresponds to the <c> VORPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing one of the source operands. @@ -631,7 +631,7 @@ _mm256_or_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VXORPD / XORPD instruction. +/// This intrinsic corresponds to the <c> VXORPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing one of the source operands. @@ -649,7 +649,7 @@ _mm256_xor_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VXORPS / XORPS instruction. +/// This intrinsic corresponds to the <c> VXORPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing one of the source operands. @@ -669,7 +669,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction. +/// This intrinsic corresponds to the <c> VHADDPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing one of the source operands. @@ -692,7 +692,7 @@ _mm256_hadd_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction. +/// This intrinsic corresponds to the <c> VHADDPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing one of the source operands. @@ -715,7 +715,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction. +/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double] containing one of the source operands. @@ -738,7 +738,7 @@ _mm256_hsub_pd(__m256d __a, __m256d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction. +/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float] containing one of the source operands. @@ -762,23 +762,23 @@ _mm256_hsub_ps(__m256 __a, __m256 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction. +/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. /// /// \param __a /// A 128-bit vector of [2 x double]. /// \param __c /// A 128-bit integer vector operand specifying how the values are to be -/// copied. -/// Bit [1]: -/// 0: Bits [63:0] of the source are copied to bits [63:0] of the -/// returned vector. -/// 1: Bits [127:64] of the source are copied to bits [63:0] of the -/// returned vector. -/// Bit [65]: -/// 0: Bits [63:0] of the source are copied to bits [127:64] of the -/// returned vector. -/// 1: Bits [127:64] of the source are copied to bits [127:64] of the -/// returned vector. +/// copied. \n +/// Bit [1]: \n +/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned +/// vector. \n +/// 1: Bits [127:64] of the source are copied to bits [63:0] of the +/// returned vector. \n +/// Bit [65]: \n +/// 0: Bits [63:0] of the source are copied to bits [127:64] of the +/// returned vector. \n +/// 1: Bits [127:64] of the source are copied to bits [127:64] of the +/// returned vector. /// \returns A 128-bit vector of [2 x double] containing the copied values. 
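[Editor's note, not part of the patch] The rewritten comments above spell out the control encodings in bit-field form: for _mm256_round_pd / _mm256_round_ps, bits [1:0] of M pick the rounding mode and bit [3] suppresses the precision exception; for _mm_permutevar_pd, bit 1 of each 64-bit control lane picks the source element. A short usage sketch under those documented semantics (the helper names broadcast_high and round_down are made up for illustration):

#include <immintrin.h>

/* Control value 2 (binary 10) in both 64-bit lanes sets bit 1 of each
 * lane, so both result elements take a[1]. */
__m128d broadcast_high(__m128d a)
{
    return _mm_permutevar_pd(a, _mm_set_epi64x(2, 2));
}

/* M bits [1:0] = 01 selects rounding toward negative infinity; bit [3]
 * set leaves the precision-exception field untouched. */
__m256d round_down(__m256d v)
{
    return _mm256_round_pd(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}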
static __inline __m128d __DEFAULT_FN_ATTRS _mm_permutevar_pd(__m128d __a, __m128i __c) @@ -786,37 +786,37 @@ _mm_permutevar_pd(__m128d __a, __m128i __c) return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); } -/// \brief Copies the values in a 256-bit vector of [4 x double] as -/// specified by the 256-bit integer vector operand. +/// \brief Copies the values in a 256-bit vector of [4 x double] as specified +/// by the 256-bit integer vector operand. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction. +/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double]. /// \param __c /// A 256-bit integer vector operand specifying how the values are to be -/// copied. -/// Bit [1]: -/// 0: Bits [63:0] of the source are copied to bits [63:0] of the -/// returned vector. -/// 1: Bits [127:64] of the source are copied to bits [63:0] of the -/// returned vector. -/// Bit [65]: -/// 0: Bits [63:0] of the source are copied to bits [127:64] of the -/// returned vector. -/// 1: Bits [127:64] of the source are copied to bits [127:64] of the -/// returned vector. -/// Bit [129]: -/// 0: Bits [191:128] of the source are copied to bits [191:128] of the -/// returned vector. -/// 1: Bits [255:192] of the source are copied to bits [191:128] of the -/// returned vector. -/// Bit [193]: -/// 0: Bits [191:128] of the source are copied to bits [255:192] of the -/// returned vector. -/// 1: Bits [255:192] of the source are copied to bits [255:192] of the +/// copied. \n +/// Bit [1]: \n +/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned +/// vector. \n +/// 1: Bits [127:64] of the source are copied to bits [63:0] of the +/// returned vector. \n +/// Bit [65]: \n +/// 0: Bits [63:0] of the source are copied to bits [127:64] of the +/// returned vector. \n +/// 1: Bits [127:64] of the source are copied to bits [127:64] of the +/// returned vector. \n +/// Bit [129]: \n +/// 0: Bits [191:128] of the source are copied to bits [191:128] of the +/// returned vector. \n +/// 1: Bits [255:192] of the source are copied to bits [191:128] of the +/// returned vector. \n +/// Bit [193]: \n +/// 0: Bits [191:128] of the source are copied to bits [255:192] of the +/// returned vector. \n +/// 1: Bits [255:192] of the source are copied to bits [255:192] of the /// returned vector. /// \returns A 256-bit vector of [4 x double] containing the copied values. static __inline __m256d __DEFAULT_FN_ATTRS @@ -827,52 +827,51 @@ _mm256_permutevar_pd(__m256d __a, __m256i __c) /// \brief Copies the values stored in a 128-bit vector of [4 x float] as /// specified by the 128-bit integer vector operand. -/// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. +/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. /// \param __c /// A 128-bit integer vector operand specifying how the values are to be -/// copied. -/// Bits [1:0]: -/// 00: Bits [31:0] of the source are copied to bits [31:0] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [31:0] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [31:0] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [31:0] of the -/// returned vector. 
-/// Bits [33:32]: -/// 00: Bits [31:0] of the source are copied to bits [63:32] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [63:32] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [63:32] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [63:32] of the -/// returned vector. -/// Bits [65:64]: -/// 00: Bits [31:0] of the source are copied to bits [95:64] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [95:64] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [95:64] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [95:64] of the -/// returned vector. -/// Bits [97:96]: -/// 00: Bits [31:0] of the source are copied to bits [127:96] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [127:96] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [127:96] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [127:96] of the -/// returned vector. +/// copied. \n +/// Bits [1:0]: \n +/// 00: Bits [31:0] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// Bits [33:32]: \n +/// 00: Bits [31:0] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// Bits [65:64]: \n +/// 00: Bits [31:0] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// Bits [97:96]: \n +/// 00: Bits [31:0] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [127:96] of the +/// returned vector. /// \returns A 128-bit vector of [4 x float] containing the copied values. static __inline __m128 __DEFAULT_FN_ATTRS _mm_permutevar_ps(__m128 __a, __m128i __c) @@ -885,85 +884,85 @@ _mm_permutevar_ps(__m128 __a, __m128i __c) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. +/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float]. /// \param __c /// A 256-bit integer vector operand specifying how the values are to be -/// copied. -/// Bits [1:0]: -/// 00: Bits [31:0] of the source are copied to bits [31:0] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [31:0] of the -/// returned vector. 
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [31:0] of the -/// returned vector. -/// Bits [33:32]: -/// 00: Bits [31:0] of the source are copied to bits [63:32] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [63:32] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [63:32] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [63:32] of the -/// returned vector. -/// Bits [65:64]: -/// 00: Bits [31:0] of the source are copied to bits [95:64] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [95:64] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [95:64] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [95:64] of the -/// returned vector. -/// Bits [97:96]: -/// 00: Bits [31:0] of the source are copied to bits [127:96] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [127:96] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [127:96] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [127:96] of the -/// returned vector. -/// Bits [129:128]: -/// 00: Bits [159:128] of the source are copied to bits [159:128] of the -/// returned vector. -/// 01: Bits [191:160] of the source are copied to bits [159:128] of the -/// returned vector. -/// 10: Bits [223:192] of the source are copied to bits [159:128] of the -/// returned vector. -/// 11: Bits [255:224] of the source are copied to bits [159:128] of the -/// returned vector. -/// Bits [161:160]: -/// 00: Bits [159:128] of the source are copied to bits [191:160] of the -/// returned vector. -/// 01: Bits [191:160] of the source are copied to bits [191:160] of the -/// returned vector. -/// 10: Bits [223:192] of the source are copied to bits [191:160] of the -/// returned vector. -/// 11: Bits [255:224] of the source are copied to bits [191:160] of the -/// returned vector. -/// Bits [193:192]: -/// 00: Bits [159:128] of the source are copied to bits [223:192] of the -/// returned vector. -/// 01: Bits [191:160] of the source are copied to bits [223:192] of the -/// returned vector. -/// 10: Bits [223:192] of the source are copied to bits [223:192] of the -/// returned vector. -/// 11: Bits [255:224] of the source are copied to bits [223:192] of the -/// returned vector. -/// Bits [225:224]: -/// 00: Bits [159:128] of the source are copied to bits [255:224] of the -/// returned vector. -/// 01: Bits [191:160] of the source are copied to bits [255:224] of the -/// returned vector. -/// 10: Bits [223:192] of the source are copied to bits [255:224] of the -/// returned vector. -/// 11: Bits [255:224] of the source are copied to bits [255:224] of the -/// returned vector. +/// copied. \n +/// Bits [1:0]: \n +/// 00: Bits [31:0] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// Bits [33:32]: \n +/// 00: Bits [31:0] of the source are copied to bits [63:32] of the +/// returned vector. 
\n +/// 01: Bits [63:32] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// Bits [65:64]: \n +/// 00: Bits [31:0] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// Bits [97:96]: \n +/// 00: Bits [31:0] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// Bits [129:128]: \n +/// 00: Bits [159:128] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// Bits [161:160]: \n +/// 00: Bits [159:128] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// Bits [193:192]: \n +/// 00: Bits [159:128] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// Bits [225:224]: \n +/// 00: Bits [159:128] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [255:224] of the +/// returned vector. /// \returns A 256-bit vector of [8 x float] containing the copied values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, __m256i __c) @@ -971,8 +970,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); } -/// \brief Copies the values in a 128-bit vector of [2 x double] as -/// specified by the immediate integer operand. +/// \brief Copies the values in a 128-bit vector of [2 x double] as specified +/// by the immediate integer operand. 
/// /// \headerfile <x86intrin.h> /// @@ -980,30 +979,31 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// __m128d _mm_permute_pd(__m128d A, const int C); /// \endcode /// -/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction. +/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. /// /// \param A /// A 128-bit vector of [2 x double]. /// \param C -/// An immediate integer operand specifying how the values are to be copied. -/// Bit [0]: -/// 0: Bits [63:0] of the source are copied to bits [63:0] of the -/// returned vector. -/// 1: Bits [127:64] of the source are copied to bits [63:0] of the -/// returned vector. -/// Bit [1]: -/// 0: Bits [63:0] of the source are copied to bits [127:64] of the -/// returned vector. -/// 1: Bits [127:64] of the source are copied to bits [127:64] of the -/// returned vector. +/// An immediate integer operand specifying how the values are to be +/// copied. \n +/// Bit [0]: \n +/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned +/// vector. \n +/// 1: Bits [127:64] of the source are copied to bits [63:0] of the +/// returned vector. \n +/// Bit [1]: \n +/// 0: Bits [63:0] of the source are copied to bits [127:64] of the +/// returned vector. \n +/// 1: Bits [127:64] of the source are copied to bits [127:64] of the +/// returned vector. /// \returns A 128-bit vector of [2 x double] containing the copied values. #define _mm_permute_pd(A, C) __extension__ ({ \ (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \ (__v2df)_mm_undefined_pd(), \ ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); }) -/// \brief Copies the values in a 256-bit vector of [4 x double] as -/// specified by the immediate integer operand. +/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by +/// the immediate integer operand. /// /// \headerfile <x86intrin.h> /// @@ -1011,32 +1011,33 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// __m256d _mm256_permute_pd(__m256d A, const int C); /// \endcode /// -/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction. +/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. /// /// \param A /// A 256-bit vector of [4 x double]. /// \param C -/// An immediate integer operand specifying how the values are to be copied. -/// Bit [0]: -/// 0: Bits [63:0] of the source are copied to bits [63:0] of the -/// returned vector. -/// 1: Bits [127:64] of the source are copied to bits [63:0] of the -/// returned vector. -/// Bit [1]: -/// 0: Bits [63:0] of the source are copied to bits [127:64] of the -/// returned vector. -/// 1: Bits [127:64] of the source are copied to bits [127:64] of the -/// returned vector. -/// Bit [2]: -/// 0: Bits [191:128] of the source are copied to bits [191:128] of the -/// returned vector. -/// 1: Bits [255:192] of the source are copied to bits [191:128] of the -/// returned vector. -/// Bit [3]: -/// 0: Bits [191:128] of the source are copied to bits [255:192] of the -/// returned vector. -/// 1: Bits [255:192] of the source are copied to bits [255:192] of the -/// returned vector. +/// An immediate integer operand specifying how the values are to be +/// copied. \n +/// Bit [0]: \n +/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned +/// vector. \n +/// 1: Bits [127:64] of the source are copied to bits [63:0] of the +/// returned vector. \n +/// Bit [1]: \n +/// 0: Bits [63:0] of the source are copied to bits [127:64] of the +/// returned vector. 
\n +/// 1: Bits [127:64] of the source are copied to bits [127:64] of the +/// returned vector. \n +/// Bit [2]: \n +/// 0: Bits [191:128] of the source are copied to bits [191:128] of the +/// returned vector. \n +/// 1: Bits [255:192] of the source are copied to bits [191:128] of the +/// returned vector. \n +/// Bit [3]: \n +/// 0: Bits [191:128] of the source are copied to bits [255:192] of the +/// returned vector. \n +/// 1: Bits [255:192] of the source are copied to bits [255:192] of the +/// returned vector. /// \returns A 256-bit vector of [4 x double] containing the copied values. #define _mm256_permute_pd(A, C) __extension__ ({ \ (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ @@ -1046,8 +1047,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) 2 + (((C) >> 2) & 0x1), \ 2 + (((C) >> 3) & 0x1)); }) -/// \brief Copies the values in a 128-bit vector of [4 x float] as -/// specified by the immediate integer operand. +/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by +/// the immediate integer operand. /// /// \headerfile <x86intrin.h> /// @@ -1055,48 +1056,49 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// __m128 _mm_permute_ps(__m128 A, const int C); /// \endcode /// -/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. +/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. /// /// \param A /// A 128-bit vector of [4 x float]. /// \param C -/// An immediate integer operand specifying how the values are to be copied. -/// Bits [1:0]: -/// 00: Bits [31:0] of the source are copied to bits [31:0] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [31:0] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [31:0] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [31:0] of the -/// returned vector. -/// Bits [3:2]: -/// 00: Bits [31:0] of the source are copied to bits [63:32] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [63:32] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [63:32] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [63:32] of the -/// returned vector. -/// Bits [5:4]: -/// 00: Bits [31:0] of the source are copied to bits [95:64] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [95:64] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [95:64] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [95:64] of the -/// returned vector. -/// Bits [7:6]: -/// 00: Bits [31:0] of the source are copied to bits [127:96] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [127:96] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [127:96] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [127:96] of the -/// returned vector. +/// An immediate integer operand specifying how the values are to be +/// copied. \n +/// Bits [1:0]: \n +/// 00: Bits [31:0] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [31:0] of the +/// returned vector. 
\n +/// Bits [3:2]: \n +/// 00: Bits [31:0] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// Bits [5:4]: \n +/// 00: Bits [31:0] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// Bits [7:6]: \n +/// 00: Bits [31:0] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [127:96] of the +/// returned vector. /// \returns A 128-bit vector of [4 x float] containing the copied values. #define _mm_permute_ps(A, C) __extension__ ({ \ (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \ @@ -1104,8 +1106,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \ ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); }) -/// \brief Copies the values in a 256-bit vector of [8 x float] as -/// specified by the immediate integer operand. +/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by +/// the immediate integer operand. /// /// \headerfile <x86intrin.h> /// @@ -1113,84 +1115,85 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// __m256 _mm256_permute_ps(__m256 A, const int C); /// \endcode /// -/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. +/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. /// /// \param A /// A 256-bit vector of [8 x float]. /// \param C -/// An immediate integer operand specifying how the values are to be copied. -/// Bits [1:0]: -/// 00: Bits [31:0] of the source are copied to bits [31:0] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [31:0] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [31:0] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [31:0] of the -/// returned vector. -/// Bits [3:2]: -/// 00: Bits [31:0] of the source are copied to bits [63:32] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [63:32] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [63:32] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [63:32] of the -/// returned vector. -/// Bits [5:4]: -/// 00: Bits [31:0] of the source are copied to bits [95:64] of the -/// returned vector. -/// 01: Bits [63:32] of the source are copied to bits [95:64] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [95:64] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [95:64] of the -/// returned vector. -/// Bits [7:6]: -/// 00: Bits [31:0] of the source are copied to bits [127:96] of the -/// returned vector. 
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the -/// returned vector. -/// 10: Bits [95:64] of the source are copied to bits [127:96] of the -/// returned vector. -/// 11: Bits [127:96] of the source are copied to bits [127:96] of the -/// returned vector. -/// Bits [1:0]: -/// 00: Bits [159:128] of the source are copied to bits [159:128] of the -/// returned vector. -/// 01: Bits [191:160] of the source are copied to bits [159:128] of the -/// returned vector. -/// 10: Bits [223:192] of the source are copied to bits [159:128] of the -/// returned vector. -/// 11: Bits [255:224] of the source are copied to bits [159:128] of the -/// returned vector. -/// Bits [3:2]: -/// 00: Bits [159:128] of the source are copied to bits [191:160] of the -/// returned vector. -/// 01: Bits [191:160] of the source are copied to bits [191:160] of the -/// returned vector. -/// 10: Bits [223:192] of the source are copied to bits [191:160] of the -/// returned vector. -/// 11: Bits [255:224] of the source are copied to bits [191:160] of the -/// returned vector. -/// Bits [5:4]: -/// 00: Bits [159:128] of the source are copied to bits [223:192] of the -/// returned vector. -/// 01: Bits [191:160] of the source are copied to bits [223:192] of the -/// returned vector. -/// 10: Bits [223:192] of the source are copied to bits [223:192] of the -/// returned vector. -/// 11: Bits [255:224] of the source are copied to bits [223:192] of the -/// returned vector. -/// Bits [7:6]: -/// 00: Bits [159:128] of the source are copied to bits [255:224] of the -/// returned vector. -/// 01: Bits [191:160] of the source are copied to bits [255:224] of the -/// returned vector. -/// 10: Bits [223:192] of the source are copied to bits [255:224] of the -/// returned vector. -/// 11: Bits [255:224] of the source are copied to bits [255:224] of the -/// returned vector. +/// An immediate integer operand specifying how the values are to be \n +/// copied. \n +/// Bits [1:0]: \n +/// 00: Bits [31:0] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// Bits [3:2]: \n +/// 00: Bits [31:0] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// Bits [5:4]: \n +/// 00: Bits [31:0] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// Bits [7:6]: \n +/// 00: Bits [31:qq0] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [127:96] of the +/// returned vector. 
\n +/// 11: Bits [127:96] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// Bits [1:0]: \n +/// 00: Bits [159:128] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// Bits [3:2]: \n +/// 00: Bits [159:128] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// Bits [5:4]: \n +/// 00: Bits [159:128] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// Bits [7:6]: \n +/// 00: Bits [159:128] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [255:224] of the +/// returned vector. /// \returns A 256-bit vector of [8 x float] containing the copied values. #define _mm256_permute_ps(A, C) __extension__ ({ \ (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ @@ -1213,7 +1216,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M); /// \endcode /// -/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction. +/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction. /// /// \param V1 /// A 256-bit vector of [4 x double]. @@ -1221,25 +1224,25 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// A 256-bit vector of [4 x double. /// \param M /// An immediate integer operand specifying how the values are to be -/// permuted. -/// Bits [1:0]: -/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the -/// destination. -/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the -/// destination. -/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the -/// destination. -/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the -/// destination. -/// Bits [5:4]: -/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the -/// destination. -/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the -/// destination. -/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the -/// destination. -/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the -/// destination. +/// permuted. \n +/// Bits [1:0]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the +/// destination. 
\n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the +/// destination. \n +/// Bits [5:4]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the +/// destination. /// \returns A 256-bit vector of [4 x double] containing the copied values. #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ @@ -1254,7 +1257,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M); /// \endcode /// -/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction. +/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction. /// /// \param V1 /// A 256-bit vector of [8 x float]. @@ -1262,24 +1265,24 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// A 256-bit vector of [8 x float]. /// \param M /// An immediate integer operand specifying how the values are to be -/// permuted. -/// Bits [1:0]: -/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the -/// destination. -/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the -/// destination. -/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the -/// destination. -/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the -/// destination. -/// Bits [5:4]: -/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the -/// destination. -/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the -/// destination. -/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the -/// destination. -/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the +/// permuted. \n +/// Bits [1:0]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the +/// destination. \n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the +/// destination. \n +/// Bits [5:4]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the /// destination. /// \returns A 256-bit vector of [8 x float] containing the copied values. #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ @@ -1295,7 +1298,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M); /// \endcode /// -/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction. +/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction. /// /// \param V1 /// A 256-bit integer vector. @@ -1303,23 +1306,23 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// A 256-bit integer vector. 
/// \param M /// An immediate integer operand specifying how the values are to be copied. -/// Bits [1:0]: -/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the -/// destination. -/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the -/// destination. -/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the -/// destination. -/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the -/// destination. -/// Bits [5:4]: -/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the -/// destination. -/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the -/// destination. -/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the -/// destination. -/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the +/// Bits [1:0]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the +/// destination. \n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the +/// destination. \n +/// Bits [5:4]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the /// destination. /// \returns A 256-bit integer vector containing the copied values. #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ @@ -1337,7 +1340,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M); /// \endcode /// -/// This intrinsic corresponds to the \c VBLENDPD / BLENDPD instruction. +/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction. /// /// \param V1 /// A 256-bit vector of [4 x double]. @@ -1347,9 +1350,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// An immediate integer operand, with mask bits [3:0] specifying how the /// values are to be copied. The position of the mask bit corresponds to the /// index of a copied value. When a mask bit is 0, the corresponding 64-bit -/// element in operand V1 is copied to the same position in the destination. -/// When a mask bit is 1, the corresponding 64-bit element in operand V2 is -/// copied to the same position in the destination. +/// element in operand \a V1 is copied to the same position in the +/// destination. When a mask bit is 1, the corresponding 64-bit element in +/// operand \a V2 is copied to the same position in the destination. /// \returns A 256-bit vector of [4 x double] containing the copied values. #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \ @@ -1369,7 +1372,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M); /// \endcode /// -/// This intrinsic corresponds to the \c VBLENDPS / BLENDPS instruction. +/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction. /// /// \param V1 /// A 256-bit vector of [8 x float]. @@ -1379,9 +1382,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// An immediate integer operand, with mask bits [7:0] specifying how the /// values are to be copied. 
The position of the mask bit corresponds to the /// index of a copied value. When a mask bit is 0, the corresponding 32-bit -/// element in operand V1 is copied to the same position in the destination. -/// When a mask bit is 1, the corresponding 32-bit element in operand V2 is -/// copied to the same position in the destination. +/// element in operand \a V1 is copied to the same position in the +/// destination. When a mask bit is 1, the corresponding 32-bit element in +/// operand \a V2 is copied to the same position in the destination. /// \returns A 256-bit vector of [8 x float] containing the copied values. #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \ @@ -1401,7 +1404,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VBLENDVPD / BLENDVPD instruction. +/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double]. @@ -1411,9 +1414,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying /// how the values are to be copied. The position of the mask bit corresponds /// to the most significant bit of a copied value. When a mask bit is 0, the -/// corresponding 64-bit element in operand __a is copied to the same +/// corresponding 64-bit element in operand \a __a is copied to the same /// position in the destination. When a mask bit is 1, the corresponding -/// 64-bit element in operand __b is copied to the same position in the +/// 64-bit element in operand \a __b is copied to the same position in the /// destination. /// \returns A 256-bit vector of [4 x double] containing the copied values. static __inline __m256d __DEFAULT_FN_ATTRS @@ -1429,7 +1432,7 @@ _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VBLENDVPS / BLENDVPS instruction. +/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float]. @@ -1439,9 +1442,9 @@ _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) /// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63, /// and 31 specifying how the values are to be copied. The position of the /// mask bit corresponds to the most significant bit of a copied value. When -/// a mask bit is 0, the corresponding 32-bit element in operand __a is +/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is /// copied to the same position in the destination. When a mask bit is 1, the -/// corresponding 32-bit element in operand __b is copied to the same +/// corresponding 32-bit element in operand \a __b is copied to the same /// position in the destination. /// \returns A 256-bit vector of [8 x float] containing the copied values. static __inline __m256 __DEFAULT_FN_ATTRS @@ -1455,12 +1458,12 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Computes two dot products in parallel, using the lower and upper /// halves of two [8 x float] vectors as input to the two computations, and /// returning the two dot products in the lower and upper halves of the -/// [8 x float] result. The immediate integer operand controls which -/// input elements will contribute to the dot product, and where the final -/// results are returned. 
In general, for each dot product, the four -/// corresponding elements of the input vectors are multiplied; the first -/// two and second two products are summed, then the two sums are added to -/// form the final result. +/// [8 x float] result. The immediate integer operand controls which input +/// elements will contribute to the dot product, and where the final results +/// are returned. In general, for each dot product, the four corresponding +/// elements of the input vectors are multiplied; the first two and second +/// two products are summed, then the two sums are added to form the final +/// result. /// /// \headerfile <x86intrin.h> /// @@ -1468,7 +1471,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M); /// \endcode /// -/// This intrinsic corresponds to the \c VDPPS / DPPS instruction. +/// This intrinsic corresponds to the <c> VDPPS </c> instruction. /// /// \param V1 /// A vector of [8 x float] values, treated as two [4 x float] vectors. @@ -1510,7 +1513,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask); /// \endcode /// -/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction. +/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction. /// /// \param a /// A 256-bit vector of [8 x float]. The four selected elements in this @@ -1522,22 +1525,23 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// destination, according to the bits specified in the immediate operand. /// \param mask /// An immediate value containing an 8-bit value specifying which elements to -/// copy from a and b. Bits [3:0] specify the values copied from operand a. -/// Bits [7:4] specify the values copied from operand b. +/// copy from \a a and \a b \n. +/// Bits [3:0] specify the values copied from operand \a a. \n +/// Bits [7:4] specify the values copied from operand \a b. \n /// The destinations within the 256-bit destination are assigned values as -/// follows, according to the bit value assignments described below: +/// follows, according to the bit value assignments described below: \n /// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the -/// destination. +/// destination. \n /// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the -/// destination. +/// destination. \n /// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the -/// destination. +/// destination. \n /// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in -/// the destination. -/// Bit value assignments: -/// 00: Bits [31:0] and [159:128] are copied from the selected operand. -/// 01: Bits [63:32] and [191:160] are copied from the selected operand. -/// 10: Bits [95:64] and [223:192] are copied from the selected operand. +/// the destination. \n +/// Bit value assignments: \n +/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n +/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n +/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n /// 11: Bits [127:96] and [255:224] are copied from the selected operand. /// \returns A 256-bit vector of [8 x float] containing the shuffled values. 
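///
/// Editor's illustrative sketch (not part of the original header), assuming
/// user code includes <immintrin.h>: with \a mask = 0x88 (binary 10 00 10 00),
/// each 128-bit half of the result is { a[0], a[2], b[0], b[2] } -- the usual
/// "gather the even-indexed elements" shuffle.
/// \code
///   __m256 evens = _mm256_shuffle_ps(a, b, 0x88);
/// \endcode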
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ @@ -1567,7 +1571,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask); /// \endcode /// -/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction. +/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction. /// /// \param a /// A 256-bit vector of [4 x double]. @@ -1575,22 +1579,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// A 256-bit vector of [4 x double]. /// \param mask /// An immediate value containing 8-bit values specifying which elements to -/// copy from a and b: -/// Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the -/// destination. -/// Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the -/// destination. -/// Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the -/// destination. -/// Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the -/// destination. -/// Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the -/// destination. -/// Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the -/// destination. -/// Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the -/// destination. -/// Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the +/// copy from \a a and \a b: \n +/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the +/// destination. \n +/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the +/// destination. \n +/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the +/// destination. \n +/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the +/// destination. \n +/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the +/// destination. \n +/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the +/// destination. \n +/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the +/// destination. \n +/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the /// destination. /// \returns A 256-bit vector of [4 x double] containing the shuffled values. #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \ @@ -1647,7 +1651,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c); /// \endcode /// -/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction. +/// This intrinsic corresponds to the <c> VCMPPD </c> instruction. /// /// \param a /// A 128-bit vector of [2 x double]. @@ -1655,16 +1659,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// A 128-bit vector of [2 x double]. 
/// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: -/// 00h, 08h, 10h, 18h: Equal -/// 01h, 09h, 11h, 19h: Less than -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped -/// operands) -/// 03h, 0Bh, 13h, 1Bh: Unordered -/// 04h, 0Ch, 14h, 1Ch: Not equal -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands) +/// operation to use: \n +/// 00h, 08h, 10h, 18h: Equal \n +/// 01h, 09h, 11h, 19h: Less than \n +/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal +/// (swapped operands) \n +/// 03h, 0Bh, 13h, 1Bh: Unordered \n +/// 04h, 0Ch, 14h, 1Ch: Not equal \n +/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than +/// (swapped operands) \n /// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) +/// (swapped operands) \n /// 07h, 0Fh, 17h, 1Fh: Ordered /// \returns A 128-bit vector of [2 x double] containing the comparison results. #define _mm_cmp_pd(a, b, c) __extension__ ({ \ @@ -1683,7 +1688,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c); /// \endcode /// -/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction. +/// This intrinsic corresponds to the <c> VCMPPS </c> instruction. /// /// \param a /// A 128-bit vector of [4 x float]. @@ -1691,16 +1696,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// A 128-bit vector of [4 x float]. /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: -/// 00h, 08h, 10h, 18h: Equal -/// 01h, 09h, 11h, 19h: Less than -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped -/// operands) -/// 03h, 0Bh, 13h, 1Bh: Unordered -/// 04h, 0Ch, 14h, 1Ch: Not equal -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands) +/// operation to use: \n +/// 00h, 08h, 10h, 18h: Equal \n +/// 01h, 09h, 11h, 19h: Less than \n +/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal +/// (swapped operands) \n +/// 03h, 0Bh, 13h, 1Bh: Unordered \n +/// 04h, 0Ch, 14h, 1Ch: Not equal \n +/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than +/// (swapped operands) \n /// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) +/// (swapped operands) \n /// 07h, 0Fh, 17h, 1Fh: Ordered /// \returns A 128-bit vector of [4 x float] containing the comparison results. #define _mm_cmp_ps(a, b, c) __extension__ ({ \ @@ -1719,7 +1725,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c); /// \endcode /// -/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction. +/// This intrinsic corresponds to the <c> VCMPPD </c> instruction. /// /// \param a /// A 256-bit vector of [4 x double]. @@ -1727,16 +1733,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// A 256-bit vector of [4 x double]. 
/// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: -/// 00h, 08h, 10h, 18h: Equal -/// 01h, 09h, 11h, 19h: Less than -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped -/// operands) -/// 03h, 0Bh, 13h, 1Bh: Unordered -/// 04h, 0Ch, 14h, 1Ch: Not equal -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands) +/// operation to use: \n +/// 00h, 08h, 10h, 18h: Equal \n +/// 01h, 09h, 11h, 19h: Less than \n +/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal +/// (swapped operands) \n +/// 03h, 0Bh, 13h, 1Bh: Unordered \n +/// 04h, 0Ch, 14h, 1Ch: Not equal \n +/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than +/// (swapped operands) \n /// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) +/// (swapped operands) \n /// 07h, 0Fh, 17h, 1Fh: Ordered /// \returns A 256-bit vector of [4 x double] containing the comparison results. #define _mm256_cmp_pd(a, b, c) __extension__ ({ \ @@ -1755,7 +1762,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c); /// \endcode /// -/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction. +/// This intrinsic corresponds to the <c> VCMPPS </c> instruction. /// /// \param a /// A 256-bit vector of [8 x float]. @@ -1763,16 +1770,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// A 256-bit vector of [8 x float]. /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: -/// 00h, 08h, 10h, 18h: Equal -/// 01h, 09h, 11h, 19h: Less than -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped -/// operands) -/// 03h, 0Bh, 13h, 1Bh: Unordered -/// 04h, 0Ch, 14h, 1Ch: Not equal -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands) +/// operation to use: \n +/// 00h, 08h, 10h, 18h: Equal \n +/// 01h, 09h, 11h, 19h: Less than \n +/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal +/// (swapped operands) \n +/// 03h, 0Bh, 13h, 1Bh: Unordered \n +/// 04h, 0Ch, 14h, 1Ch: Not equal \n +/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than +/// (swapped operands) \n /// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) +/// (swapped operands) \n /// 07h, 0Fh, 17h, 1Fh: Ordered /// \returns A 256-bit vector of [8 x float] containing the comparison results. #define _mm256_cmp_ps(a, b, c) __extension__ ({ \ @@ -1790,7 +1798,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c); /// \endcode /// -/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction. +/// This intrinsic corresponds to the <c> VCMPSD </c> instruction. /// /// \param a /// A 128-bit vector of [2 x double]. @@ -1798,16 +1806,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// A 128-bit vector of [2 x double]. 
/// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: -/// 00h, 08h, 10h, 18h: Equal -/// 01h, 09h, 11h, 19h: Less than -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped -/// operands) -/// 03h, 0Bh, 13h, 1Bh: Unordered -/// 04h, 0Ch, 14h, 1Ch: Not equal -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands) +/// operation to use: \n +/// 00h, 08h, 10h, 18h: Equal \n +/// 01h, 09h, 11h, 19h: Less than \n +/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal +/// (swapped operands) \n +/// 03h, 0Bh, 13h, 1Bh: Unordered \n +/// 04h, 0Ch, 14h, 1Ch: Not equal \n +/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than +/// (swapped operands) \n /// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) +/// (swapped operands) \n /// 07h, 0Fh, 17h, 1Fh: Ordered /// \returns A 128-bit vector of [2 x double] containing the comparison results. #define _mm_cmp_sd(a, b, c) __extension__ ({ \ @@ -1825,7 +1834,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c); /// \endcode /// -/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction. +/// This intrinsic corresponds to the <c> VCMPSS </c> instruction. /// /// \param a /// A 128-bit vector of [4 x float]. @@ -1833,16 +1842,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// A 128-bit vector of [4 x float]. /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: -/// 00h, 08h, 10h, 18h: Equal -/// 01h, 09h, 11h, 19h: Less than -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped -/// operands) -/// 03h, 0Bh, 13h, 1Bh: Unordered -/// 04h, 0Ch, 14h, 1Ch: Not equal -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands) +/// operation to use: \n +/// 00h, 08h, 10h, 18h: Equal \n +/// 01h, 09h, 11h, 19h: Less than \n +/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal +/// (swapped operands) \n +/// 03h, 0Bh, 13h, 1Bh: Unordered \n +/// 04h, 0Ch, 14h, 1Ch: Not equal \n +/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than +/// (swapped operands) \n /// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) +/// (swapped operands) \n /// 07h, 0Fh, 17h, 1Fh: Ordered /// \returns A 128-bit vector of [4 x float] containing the comparison results. #define _mm_cmp_ss(a, b, c) __extension__ ({ \ @@ -1854,8 +1864,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE / -/// EXTRACTF128+COMPOSITE instruction. +/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> +/// instruction. /// /// \param __a /// A 256-bit vector of [8 x i32]. @@ -1876,8 +1886,8 @@ _mm256_extract_epi32(__m256i __a, const int __imm) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE / -/// EXTRACTF128+COMPOSITE instruction. +/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> +/// instruction. /// /// \param __a /// A 256-bit integer vector of [16 x i16]. @@ -1898,8 +1908,8 @@ _mm256_extract_epi16(__m256i __a, const int __imm) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE / -/// EXTRACTF128+COMPOSITE instruction. 
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> +/// instruction. /// /// \param __a /// A 256-bit integer vector of [32 x i8]. @@ -1921,8 +1931,8 @@ _mm256_extract_epi8(__m256i __a, const int __imm) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE / -/// EXTRACTF128+COMPOSITE instruction. +/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> +/// instruction. /// /// \param __a /// A 256-bit integer vector of [4 x i64]. @@ -1945,8 +1955,8 @@ _mm256_extract_epi64(__m256i __a, const int __imm) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE / -/// INSERTF128+COMPOSITE instruction. +/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> +/// instruction. /// /// \param __a /// A vector of [8 x i32] to be used by the insert operation. @@ -1955,8 +1965,8 @@ _mm256_extract_epi64(__m256i __a, const int __imm) /// \param __imm /// An immediate integer specifying the index of the vector element to be /// replaced. -/// \returns A copy of vector __a, after replacing its element indexed by __imm -/// with __b. +/// \returns A copy of vector \a __a, after replacing its element indexed by +/// \a __imm with \a __b. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_insert_epi32(__m256i __a, int __b, int const __imm) { @@ -1972,8 +1982,8 @@ _mm256_insert_epi32(__m256i __a, int __b, int const __imm) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE / -/// INSERTF128+COMPOSITE instruction. +/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> +/// instruction. /// /// \param __a /// A vector of [16 x i16] to be used by the insert operation. @@ -1982,8 +1992,8 @@ _mm256_insert_epi32(__m256i __a, int __b, int const __imm) /// \param __imm /// An immediate integer specifying the index of the vector element to be /// replaced. -/// \returns A copy of vector __a, after replacing its element indexed by __imm -/// with __b. +/// \returns A copy of vector \a __a, after replacing its element indexed by +/// \a __imm with \a __b. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_insert_epi16(__m256i __a, int __b, int const __imm) { @@ -1998,8 +2008,8 @@ _mm256_insert_epi16(__m256i __a, int __b, int const __imm) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE / -/// INSERTF128+COMPOSITE instruction. +/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> +/// instruction. /// /// \param __a /// A vector of [32 x i8] to be used by the insert operation. @@ -2008,8 +2018,8 @@ _mm256_insert_epi16(__m256i __a, int __b, int const __imm) /// \param __imm /// An immediate integer specifying the index of the vector element to be /// replaced. -/// \returns A copy of vector __a, after replacing its element indexed by __imm -/// with __b. +/// \returns A copy of vector \a __a, after replacing its element indexed by +/// \a __imm with \a __b. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_insert_epi8(__m256i __a, int __b, int const __imm) { @@ -2025,8 +2035,8 @@ _mm256_insert_epi8(__m256i __a, int __b, int const __imm) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE / -/// INSERTF128+COMPOSITE instruction. +/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> +/// instruction. /// /// \param __a /// A vector of [4 x i64] to be used by the insert operation. 
@@ -2035,8 +2045,8 @@ _mm256_insert_epi8(__m256i __a, int __b, int const __imm) /// \param __imm /// An immediate integer specifying the index of the vector element to be /// replaced. -/// \returns A copy of vector __a, after replacing its element indexed by __imm -/// with __b. +/// \returns A copy of vector \a __a, after replacing its element indexed by +/// \a __imm with \a __b. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_insert_epi64(__m256i __a, long long __b, int const __imm) { @@ -2051,7 +2061,7 @@ _mm256_insert_epi64(__m256i __a, long long __b, int const __imm) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction. +/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction. /// /// \param __a /// A 128-bit integer vector of [4 x i32]. @@ -2066,7 +2076,7 @@ _mm256_cvtepi32_pd(__m128i __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction. +/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction. /// /// \param __a /// A 256-bit integer vector. @@ -2082,7 +2092,7 @@ _mm256_cvtepi32_ps(__m256i __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction. +/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction. /// /// \param __a /// A 256-bit vector of [4 x double]. @@ -2097,7 +2107,7 @@ _mm256_cvtpd_ps(__m256d __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction. +/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction. /// /// \param __a /// A 256-bit vector of [8 x float]. @@ -2108,24 +2118,66 @@ _mm256_cvtps_epi32(__m256 __a) return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); } +/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 +/// x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 256-bit vector of [4 x double] containing the converted values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtps_pd(__m128 __a) { return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); } +/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 +/// x i32], truncating the result by rounding towards zero when it is +/// inexact. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 128-bit integer vector containing the converted values. static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a) { return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); } +/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 +/// x i32]. When a conversion is inexact, the value returned is rounded +/// according to the rounding control bits in the MXCSR register. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 128-bit integer vector containing the converted values. 
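///
/// Editor's illustrative sketch (not part of the original header), assuming
/// the default round-to-nearest-even MXCSR setting and user code that
/// includes <immintrin.h>: the vector { 1.5, 2.5, -0.5, 3.7 } converts to
/// { 2, 2, 0, 4 }.
/// \code
///   __m128i r = _mm256_cvtpd_epi32(_mm256_set_pd(3.7, -0.5, 2.5, 1.5));
/// \endcode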
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a) { return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); } +/// \brief Converts a vector of [8 x float] into a vector of [8 x i32], +/// truncating the result by rounding towards zero when it is inexact. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit integer vector containing the converted values. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a) { @@ -2152,18 +2204,73 @@ _mm256_cvtss_f32(__m256 __a) } /* Vector replicate */ +/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit +/// vector of [8 x float] to float values in a 256-bit vector of +/// [8 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. \n +/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of +/// the return value. \n +/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of +/// the return value. \n +/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the +/// return value. \n +/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the +/// return value. +/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated +/// values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_movehdup_ps(__m256 __a) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); } +/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit +/// vector of [8 x float] to float values in a 256-bit vector of [8 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. \n +/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of +/// the return value. \n +/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of +/// the return value. \n +/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the +/// return value. \n +/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the +/// return value. +/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated +/// values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_moveldup_ps(__m256 __a) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); } +/// \brief Moves and duplicates double-precision floating point values from a +/// 256-bit vector of [4 x double] to double-precision values in a 256-bit +/// vector of [4 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. \n +/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the +/// return value. \n +/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of +/// the return value. +/// \returns A 256-bit vector of [4 x double] containing the moved and +/// duplicated values. 
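///
/// Editor's illustrative sketch (not part of the original header), assuming
/// user code includes <immintrin.h>: the vector { 1.0, 2.0, 3.0, 4.0 }
/// becomes { 1.0, 1.0, 3.0, 3.0 }.
/// \code
///   __m256d r = _mm256_movedup_pd(_mm256_set_pd(4.0, 3.0, 2.0, 1.0));
/// \endcode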
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_movedup_pd(__m256d __a) { @@ -2171,24 +2278,98 @@ _mm256_movedup_pd(__m256d __a) } /* Unpack and Interleave */ +/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of +/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. \n +/// Bits [127:64] are written to bits [63:0] of the return value. \n +/// Bits [255:192] are written to bits [191:128] of the return value. \n +/// \param __b +/// A 256-bit floating-point vector of [4 x double]. \n +/// Bits [127:64] are written to bits [127:64] of the return value. \n +/// Bits [255:192] are written to bits [255:192] of the return value. \n +/// \returns A 256-bit vector of [4 x double] containing the interleaved values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpackhi_pd(__m256d __a, __m256d __b) { return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); } +/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of +/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. \n +/// Bits [63:0] are written to bits [63:0] of the return value. \n +/// Bits [191:128] are written to bits [191:128] of the return value. +/// \param __b +/// A 256-bit floating-point vector of [4 x double]. \n +/// Bits [63:0] are written to bits [127:64] of the return value. \n +/// Bits [191:128] are written to bits [255:192] of the return value. \n +/// \returns A 256-bit vector of [4 x double] containing the interleaved values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpacklo_pd(__m256d __a, __m256d __b) { return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); } +/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the +/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit +/// vector of [8 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. \n +/// Bits [95:64] are written to bits [31:0] of the return value. \n +/// Bits [127:96] are written to bits [95:64] of the return value. \n +/// Bits [223:192] are written to bits [159:128] of the return value. \n +/// Bits [255:224] are written to bits [223:192] of the return value. +/// \param __b +/// A 256-bit vector of [8 x float]. \n +/// Bits [95:64] are written to bits [63:32] of the return value. \n +/// Bits [127:96] are written to bits [127:96] of the return value. \n +/// Bits [223:192] are written to bits [191:160] of the return value. \n +/// Bits [255:224] are written to bits [255:224] of the return value. +/// \returns A 256-bit vector of [8 x float] containing the interleaved values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpackhi_ps(__m256 __a, __m256 __b) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); } +/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the +/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit +/// vector of [8 x float]. 
+/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. \n +/// Bits [31:0] are written to bits [31:0] of the return value. \n +/// Bits [63:32] are written to bits [95:64] of the return value. \n +/// Bits [159:128] are written to bits [159:128] of the return value. \n +/// Bits [191:160] are written to bits [223:192] of the return value. +/// \param __b +/// A 256-bit vector of [8 x float]. \n +/// Bits [31:0] are written to bits [63:32] of the return value. \n +/// Bits [63:32] are written to bits [127:96] of the return value. \n +/// Bits [159:128] are written to bits [191:160] of the return value. \n +/// Bits [191:160] are written to bits [255:224] of the return value. +/// \returns A 256-bit vector of [8 x float] containing the interleaved values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpacklo_ps(__m256 __a, __m256 __b) { @@ -2196,90 +2377,401 @@ _mm256_unpacklo_ps(__m256 __a, __m256 __b) } /* Bit Test */ +/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an +/// element-by-element comparison of the double-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the ZF flag. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns the ZF flag in the EFLAGS register. static __inline int __DEFAULT_FN_ATTRS _mm_testz_pd(__m128d __a, __m128d __b) { return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); } +/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an +/// element-by-element comparison of the double-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the CF flag. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns the CF flag in the EFLAGS register. 
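///
/// Editor's illustrative sketch (not part of the original header), assuming
/// user code includes <immintrin.h>: both elements of \a __a below have their
/// sign bits set, so no pair has sign(__a) == 0 with sign(__b) == 1, and the
/// call returns 1 (CF = 1).
/// \code
///   int cf = _mm_testc_pd(_mm_set_pd(-1.0, -2.0), _mm_set_pd(-3.0, 4.0));
/// \endcode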
static __inline int __DEFAULT_FN_ATTRS _mm_testc_pd(__m128d __a, __m128d __b) { return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); } +/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an +/// element-by-element comparison of the double-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, +/// otherwise it returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. static __inline int __DEFAULT_FN_ATTRS _mm_testnzc_pd(__m128d __a, __m128d __b) { return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); } +/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an +/// element-by-element comparison of the single-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the ZF flag. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns the ZF flag. static __inline int __DEFAULT_FN_ATTRS _mm_testz_ps(__m128 __a, __m128 __b) { return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); } +/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an +/// element-by-element comparison of the single-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the CF flag. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns the CF flag. 
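///
/// Editor's illustrative sketch (not part of the original header), assuming
/// user code includes <immintrin.h>: element 0 of \a __a is positive (sign
/// bit 0) while element 0 of \a __b is negative (sign bit 1), so CF is
/// cleared and the call returns 0.
/// \code
///   int cf = _mm_testc_ps(_mm_set_ps(-1.0f, -2.0f, -3.0f, 4.0f),
///                         _mm_set_ps(-5.0f, -6.0f, -7.0f, -8.0f));
/// \endcode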
static __inline int __DEFAULT_FN_ATTRS _mm_testc_ps(__m128 __a, __m128 __b) { return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); } +/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an +/// element-by-element comparison of the single-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, +/// otherwise it returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. static __inline int __DEFAULT_FN_ATTRS _mm_testnzc_ps(__m128 __a, __m128 __b) { return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); } +/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an +/// element-by-element comparison of the double-precision elements in the +/// first source vector and the corresponding elements in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the ZF flag. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \param __b +/// A 256-bit vector of [4 x double]. +/// \returns the ZF flag. static __inline int __DEFAULT_FN_ATTRS _mm256_testz_pd(__m256d __a, __m256d __b) { return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); } +/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an +/// element-by-element comparison of the double-precision elements in the +/// first source vector and the corresponding elements in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the CF flag. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \param __b +/// A 256-bit vector of [4 x double]. +/// \returns the CF flag. 
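///
/// Editor's illustrative sketch (not part of the original header), assuming
/// user code includes <immintrin.h>: when both operands are the same vector,
/// no element can have a clear sign bit in \a __a and a set sign bit in
/// \a __b at the same time, so the call always returns 1.
/// \code
///   __m256d v = _mm256_set1_pd(-0.0);
///   int cf = _mm256_testc_pd(v, v); /* always 1 */
/// \endcode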
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_pd(__m256d __a, __m256d __b) { return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); } +/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an +/// element-by-element comparison of the double-precision elements in the +/// first source vector and the corresponding elements in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, +/// otherwise it returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \param __b +/// A 256-bit vector of [4 x double]. +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_pd(__m256d __a, __m256d __b) { return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); } +/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an +/// element-by-element comparison of the single-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the ZF flag. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \param __b +/// A 256-bit vector of [8 x float]. +/// \returns the ZF flag. static __inline int __DEFAULT_FN_ATTRS _mm256_testz_ps(__m256 __a, __m256 __b) { return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); } +/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an +/// element-by-element comparison of the single-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the CF flag. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \param __b +/// A 256-bit vector of [8 x float]. +/// \returns the CF flag. 
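The ZF/CF behavior described for the 256-bit VTESTPD intrinsics above is easier to follow at a concrete call site. The sketch below is illustrative only and is not part of the header or the commit; the helper names, test values, and the assumption of an AVX-enabled build (e.g. -mavx) are all invented for the example.

#include <immintrin.h>

/* Illustrative only: returns nonzero when at least one element of __v has its
 * sign bit set.  _mm256_testz_pd(__v, __v) computes ZF from sign(__v) AND
 * sign(__v), so ZF == 1 exactly when no sign bit is set. */
static int any_sign_set_pd(__m256d __v)
{
  return !_mm256_testz_pd(__v, __v);
}

static int example_vtestpd(void)
{
  __m256d a = _mm256_set_pd(1.0, -2.0, 3.0, 4.0); /* one negative element        */
  __m256d m = _mm256_set1_pd(-0.0);               /* sign bit set in every lane  */

  int zf = _mm256_testz_pd(m, a);   /* 0 here: some lane has both sign bits set  */
  int cf = _mm256_testc_pd(m, a);   /* 1 here: no lane has sign(m)=0, sign(a)=1  */
  int nz = _mm256_testnzc_pd(m, a); /* 1 only when both ZF and CF are 0          */
  return zf + cf + nz + any_sign_set_pd(a);
}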
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_ps(__m256 __a, __m256 __b) { return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); } +/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an +/// element-by-element comparison of the single-precision elements in the +/// first source vector and the corresponding elements in the second source +/// vector. The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, +/// otherwise it returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \param __b +/// A 256-bit vector of [8 x float]. +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_ps(__m256 __a, __m256 __b) { return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); } +/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison +/// of the two source vectors and update the EFLAGS register as follows: \n +/// If there is at least one pair of bits where both bits are 1, the ZF flag +/// is set to 0. Otherwise the ZF flag is set to 1. \n +/// If there is at least one pair of bits where the bit from the first source +/// vector is 0 and the bit from the second source vector is 1, the CF flag +/// is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the ZF flag. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPTEST </c> instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns the ZF flag. static __inline int __DEFAULT_FN_ATTRS _mm256_testz_si256(__m256i __a, __m256i __b) { return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); } +/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison +/// of the two source vectors and update the EFLAGS register as follows: \n +/// If there is at least one pair of bits where both bits are 1, the ZF flag +/// is set to 0. Otherwise the ZF flag is set to 1. \n +/// If there is at least one pair of bits where the bit from the first source +/// vector is 0 and the bit from the second source vector is 1, the CF flag +/// is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the CF flag. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPTEST </c> instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns the CF flag. static __inline int __DEFAULT_FN_ATTRS _mm256_testc_si256(__m256i __a, __m256i __b) { return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); } +/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison +/// of the two source vectors and update the EFLAGS register as follows: \n +/// If there is at least one pair of bits where both bits are 1, the ZF flag +/// is set to 0. 
Otherwise the ZF flag is set to 1. \n +/// If there is at least one pair of bits where the bit from the first source +/// vector is 0 and the bit from the second source vector is 1, the CF flag +/// is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, +/// otherwise it returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPTEST </c> instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_si256(__m256i __a, __m256i __b) { @@ -2287,12 +2779,36 @@ _mm256_testnzc_si256(__m256i __a, __m256i __b) } /* Vector extract sign mask */ +/// \brief Extracts the sign bits of double-precision floating point elements +/// in a 256-bit vector of [4 x double] and writes them to the lower order +/// bits of the return value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the double-precision +/// floating point values with sign bits to be extracted. +/// \returns The sign bits from the operand, written to bits [3:0]. static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_pd(__m256d __a) { return __builtin_ia32_movmskpd256((__v4df)__a); } +/// \brief Extracts the sign bits of single-precision floating point elements +/// in a 256-bit vector of [8 x float] and writes them to the lower order +/// bits of the return value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the single-precision floating +/// point values with sign bits to be extracted. +/// \returns The sign bits from the operand, written to bits [7:0]. static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_ps(__m256 __a) { @@ -2300,12 +2816,22 @@ _mm256_movemask_ps(__m256 __a) } /* Vector __zero */ +/// \brief Zeroes the contents of all XMM or YMM registers. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VZEROALL </c> instruction. static __inline void __DEFAULT_FN_ATTRS _mm256_zeroall(void) { __builtin_ia32_vzeroall(); } +/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction. static __inline void __DEFAULT_FN_ATTRS _mm256_zeroupper(void) { @@ -2313,6 +2839,18 @@ _mm256_zeroupper(void) } /* Vector load with broadcast */ +/// \brief Loads a scalar single-precision floating point value from the +/// specified address pointed to by \a __a and broadcasts it to the elements +/// of a [4 x float] vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. +/// +/// \param __a +/// The single-precision floating point value to be broadcast. +/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set +/// equal to the broadcast value.
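For the integer VPTEST and the VMOVMSK sign-mask intrinsics documented above, a common idiom is an "all zero" check and a sign-bit count. The sketch below is illustrative only (not part of the header); the helper names are invented and an AVX build is assumed.

#include <immintrin.h>

/* Illustrative only: ZF = ((__v AND __v) == 0), so this returns 1 exactly
 * when every bit of __v is 0. */
static int is_all_zero_si256(__m256i __v)
{
  return _mm256_testz_si256(__v, __v);
}

static int count_negative_lanes_pd(__m256d __v)
{
  /* VMOVMSKPD packs the four sign bits into bits [3:0] of the result. */
  int mask = _mm256_movemask_pd(__v);
  int count = 0;
  while (mask) {            /* popcount over the 4-bit mask */
    count += mask & 1;
    mask >>= 1;
  }
  return count;
}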
static __inline __m128 __DEFAULT_FN_ATTRS _mm_broadcast_ss(float const *__a) { @@ -2320,6 +2858,18 @@ _mm_broadcast_ss(float const *__a) return (__m128)(__v4sf){ __f, __f, __f, __f }; } +/// \brief Loads a scalar double-precision floating point value from the +/// specified address pointed to by \a __a and broadcasts it to the elements +/// of a [4 x double] vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction. +/// +/// \param __a +/// The double-precision floating point value to be broadcast. +/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set +/// equal to the broadcast value. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a) { @@ -2327,6 +2877,18 @@ _mm256_broadcast_sd(double const *__a) return (__m256d)(__v4df){ __d, __d, __d, __d }; } +/// \brief Loads a scalar single-precision floating point value from the +/// specified address pointed to by \a __a and broadcasts it to the elements +/// of a [8 x float] vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. +/// +/// \param __a +/// The single-precision floating point value to be broadcast. +/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set +/// equal to the broadcast value. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a) { @@ -2334,12 +2896,36 @@ _mm256_broadcast_ss(float const *__a) return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; } +/// \brief Loads the data from a 128-bit vector of [2 x double] from the +/// specified address pointed to by \a __a and broadcasts it to 128-bit +/// elements in a 256-bit vector of [4 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. +/// +/// \param __a +/// The 128-bit vector of [2 x double] to be broadcast. +/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set +/// equal to the broadcast value. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a) { return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a); } +/// \brief Loads the data from a 128-bit vector of [4 x float] from the +/// specified address pointed to by \a __a and broadcasts it to 128-bit +/// elements in a 256-bit vector of [8 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. +/// +/// \param __a +/// The 128-bit vector of [4 x float] to be broadcast. +/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set +/// equal to the broadcast value. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a) { @@ -2347,18 +2933,50 @@ _mm256_broadcast_ps(__m128 const *__a) } /* SIMD load ops */ +/// \brief Loads 4 double-precision floating point values from a 32-byte aligned +/// memory location pointed to by \a __p into a vector of [4 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location containing +/// double-precision floating point values. +/// \returns A 256-bit vector of [4 x double] containing the moved values. 
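The broadcast intrinsics documented above all take a pointer to the scalar (or 128-bit block) being replicated, not the value itself. A minimal illustrative sketch follows; it is not part of the header, the parameter names are invented, and an AVX build is assumed.

#include <immintrin.h>

static void broadcast_examples(const float *one_float, const double *one_double,
                               const __m128d *pair_of_doubles)
{
  __m128  r0 = _mm_broadcast_ss(one_float);          /* {f, f, f, f}              */
  __m256  r1 = _mm256_broadcast_ss(one_float);       /* f replicated 8 times      */
  __m256d r2 = _mm256_broadcast_sd(one_double);      /* d replicated 4 times      */
  __m256d r3 = _mm256_broadcast_pd(pair_of_doubles); /* 128-bit block used twice  */
  (void)r0; (void)r1; (void)r2; (void)r3;
}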
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p) { return *(__m256d *)__p; } +/// \brief Loads 8 single-precision floating point values from a 32-byte aligned +/// memory location pointed to by \a __p into a vector of [8 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location containing float values. +/// \returns A 256-bit vector of [8 x float] containing the moved values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p) { return *(__m256 *)__p; } +/// \brief Loads 4 double-precision floating point values from an unaligned +/// memory location pointed to by \a __p into a vector of [4 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. +/// +/// \param __p +/// A pointer to a memory location containing double-precision floating +/// point values. +/// \returns A 256-bit vector of [4 x double] containing the moved values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p) { @@ -2368,6 +2986,17 @@ _mm256_loadu_pd(double const *__p) return ((struct __loadu_pd*)__p)->__v; } +/// \brief Loads 8 single-precision floating point values from an unaligned +/// memory location pointed to by \a __p into a vector of [8 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. +/// +/// \param __p +/// A pointer to a memory location containing single-precision floating +/// point values. +/// \returns A 256-bit vector of [8 x float] containing the moved values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p) { @@ -2377,12 +3006,33 @@ _mm256_loadu_ps(float const *__p) return ((struct __loadu_ps*)__p)->__v; } +/// \brief Loads 256 bits of integer data from a 32-byte aligned memory +/// location pointed to by \a __p into elements of a 256-bit integer vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a 256-bit integer vector containing integer +/// values. +/// \returns A 256-bit integer vector containing the moved values. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p) { return *__p; } +/// \brief Loads 256 bits of integer data from an unaligned memory location +/// pointed to by \a __p into a 256-bit integer vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. +/// +/// \param __p +/// A pointer to a 256-bit integer vector containing integer values. +/// \returns A 256-bit integer vector containing the moved values. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i const *__p) { @@ -2392,6 +3042,18 @@ _mm256_loadu_si256(__m256i const *__p) return ((struct __loadu_si256*)__p)->__v; } +/// \brief Loads 256 bits of integer data from an unaligned memory location +/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may +/// perform better than \c _mm256_loadu_si256 when the data crosses a cache +/// line boundary. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VLDDQU </c> instruction. +/// +/// \param __p +/// A pointer to a 256-bit integer vector containing integer values. +/// \returns A 256-bit integer vector containing the moved values. 
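The difference between the aligned and unaligned loads documented above is only the alignment contract on the pointer. The sketch below is illustrative, not part of the header; the buffer names are invented, C11 _Alignas is assumed for the aligned case, and the build is assumed to enable AVX.

#include <immintrin.h>

static __m256d load_examples(const double *unaligned_src)
{
  _Alignas(32) double aligned_buf[4] = { 1.0, 2.0, 3.0, 4.0 };

  __m256d a = _mm256_load_pd(aligned_buf);    /* VMOVAPD: requires 32-byte alignment */
  __m256d b = _mm256_loadu_pd(unaligned_src); /* VMOVUPD: any alignment              */
  return _mm256_add_pd(a, b);
}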
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i const *__p) { @@ -2399,18 +3061,55 @@ _mm256_lddqu_si256(__m256i const *__p) } /* SIMD store ops */ +/// \brief Stores double-precision floating point values from a 256-bit vector +/// of [4 x double] to a 32-byte aligned memory location pointed to by +/// \a __p. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location that will receive the +/// double-precision floating point values. +/// \param __a +/// A 256-bit vector of [4 x double] containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a) { *(__m256d *)__p = __a; } +/// \brief Stores single-precision floating point values from a 256-bit vector +/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location that will receive the +/// float values. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a) { *(__m256 *)__p = __a; } +/// \brief Stores double-precision floating point values from a 256-bit vector +/// of [4 x double] to an unaligned memory location pointed to by \a __p. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the double-precision +/// floating point values. +/// \param __a +/// A 256-bit vector of [4 x double] containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a) { @@ -2420,6 +3119,17 @@ _mm256_storeu_pd(double *__p, __m256d __a) ((struct __storeu_pd*)__p)->__v = __a; } +/// \brief Stores single-precision floating point values from a 256-bit vector +/// of [8 x float] to an unaligned memory location pointed to by \a __p. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float values. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a) { @@ -2429,12 +3139,35 @@ _mm256_storeu_ps(float *__p, __m256 __a) ((struct __storeu_ps*)__p)->__v = __a; } +/// \brief Stores integer values from a 256-bit integer vector to a 32-byte +/// aligned memory location pointed to by \a __p. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location that will receive the +/// integer values. +/// \param __a +/// A 256-bit integer vector containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a) { *__p = __a; } +/// \brief Stores integer values from a 256-bit integer vector to an unaligned +/// memory location pointed to by \a __p. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the integer values.
+/// \param __a +/// A 256-bit integer vector containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i *__p, __m256i __a) { @@ -2445,12 +3178,48 @@ _mm256_storeu_si256(__m256i *__p, __m256i __a) } /* Conditional load ops */ +/// \brief Conditionally loads double-precision floating point elements from a +/// memory location pointed to by \a __p into a 128-bit vector of +/// [2 x double], depending on the mask bits associated with each data +/// element. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the double-precision +/// floating point values. +/// \param __m +/// A 128-bit integer vector containing the mask. The most significant bit of +/// each data element represents the mask bits. If a mask bit is zero, the +/// corresponding value in the memory location is not loaded and the +/// corresponding field in the return value is set to zero. +/// \returns A 128-bit vector of [2 x double] containing the loaded values. static __inline __m128d __DEFAULT_FN_ATTRS _mm_maskload_pd(double const *__p, __m128i __m) { return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); } +/// \brief Conditionally loads double-precision floating point elements from a +/// memory location pointed to by \a __p into a 256-bit vector of +/// [4 x double], depending on the mask bits associated with each data +/// element. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the double-precision +/// floating point values. +/// \param __m +/// A 256-bit integer vector of [4 x quadword] containing the mask. The most +/// significant bit of each quadword element represents the mask bits. If a +/// mask bit is zero, the corresponding value in the memory location is not +/// loaded and the corresponding field in the return value is set to zero. +/// \returns A 256-bit vector of [4 x double] containing the loaded values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m) { @@ -2458,12 +3227,48 @@ _mm256_maskload_pd(double const *__p, __m256i __m) (__v4di)__m); } +/// \brief Conditionally loads single-precision floating point elements from a +/// memory location pointed to by \a __p into a 128-bit vector of +/// [4 x float], depending on the mask bits associated with each data +/// element. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the single-precision +/// floating point values. +/// \param __m +/// A 128-bit integer vector containing the mask. The most significant bit of +/// each data element represents the mask bits. If a mask bit is zero, the +/// corresponding value in the memory location is not loaded and the +/// corresponding field in the return value is set to zero. +/// \returns A 128-bit vector of [4 x float] containing the loaded values. 
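The masked loads documented above read only the elements whose mask lane has its most significant bit set; the other result elements are zeroed and the corresponding memory is not touched. A minimal illustrative sketch follows; it is not part of the header, the names are invented, and an AVX build is assumed.

#include <immintrin.h>

static __m256d load_first_and_third(const double *src)
{
  /* _mm256_set_epi64x lists lanes from high to low, so (0, -1, 0, -1)
   * selects elements 0 and 2 of the result; elements 1 and 3 become 0.0. */
  __m256i mask = _mm256_set_epi64x(0, -1, 0, -1);
  return _mm256_maskload_pd(src, mask);
}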
static __inline __m128 __DEFAULT_FN_ATTRS _mm_maskload_ps(float const *__p, __m128i __m) { return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); } +/// \brief Conditionally loads single-precision floating point elements from a +/// memory location pointed to by \a __p into a 256-bit vector of +/// [8 x float], depending on the mask bits associated with each data +/// element. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the single-precision +/// floating point values. +/// \param __m +/// A 256-bit integer vector of [8 x dword] containing the mask. The most +/// significant bit of each dword element represents the mask bits. If a mask +/// bit is zero, the corresponding value in the memory location is not loaded +/// and the corresponding field in the return value is set to zero. +/// \returns A 256-bit vector of [8 x float] containing the loaded values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m) { @@ -2471,24 +3276,96 @@ _mm256_maskload_ps(float const *__p, __m256i __m) } /* Conditional store ops */ +/// \brief Moves single-precision floating point values from a 256-bit vector +/// of [8 x float] to a memory location pointed to by \a __p, according to +/// the specified mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float values. +/// \param __m +/// A 256-bit integer vector of [8 x dword] containing the mask. The most +/// significant bit of each dword element in the mask vector represents the +/// mask bits. If a mask bit is zero, the corresponding value from vector +/// \a __a is not stored and the corresponding field in the memory location +/// pointed to by \a __p is not changed. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be stored. static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) { __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); } +/// \brief Moves double-precision values from a 128-bit vector of [2 x double] +/// to a memory location pointed to by \a __p, according to the specified +/// mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the double-precision +/// floating point values. +/// \param __m +/// A 128-bit integer vector containing the mask. The most significant bit of +/// each field in the mask vector represents the mask bits. If a mask bit is +/// zero, the corresponding value from vector \a __a is not stored and the +/// corresponding field in the memory location pointed to by \a __p is not +/// changed. +/// \param __a +/// A 128-bit vector of [2 x double] containing the values to be stored. static __inline void __DEFAULT_FN_ATTRS _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) { __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); } +/// \brief Moves double-precision values from a 256-bit vector of [4 x double] +/// to a memory location pointed to by \a __p, according to the specified +/// mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+/// +/// \param __p +/// A pointer to a memory location that will receive the double-precision +/// floating point values. +/// \param __m +/// A 256-bit integer vector of [4 x quadword] containing the mask. The most +/// significant bit of each quadword element in the mask vector represents +/// the mask bits. If a mask bit is zero, the corresponding value from vector +/// \a __a is not stored and the corresponding field in the memory location +/// pointed to by \a __p is not changed. +/// \param __a +/// A 256-bit vector of [4 x double] containing the values to be stored. static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) { __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); } +/// \brief Moves single-precision floating point values from a 128-bit vector +/// of [4 x float] to a memory location pointed to by \a __p, according to +/// the specified mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float values. +/// \param __m +/// A 128-bit integer vector containing the mask. The most significant bit of +/// each field in the mask vector represents the mask bits. If a mask bit is +/// zero, the corresponding value from vector \a __a is not stored and the +/// corresponding field in the memory location pointed to by \a __p is not +/// changed. +/// \param __a +/// A 128-bit vector of [4 x float] containing the values to be stored. static __inline void __DEFAULT_FN_ATTRS _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) { @@ -2496,18 +3373,58 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) } /* Cacheability support ops */ +/// \brief Moves integer data from a 256-bit integer vector to a 32-byte +/// aligned memory location. To minimize caching, the data is flagged as +/// non-temporal (unlikely to be used again soon). +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction. +/// +/// \param __a +/// A pointer to a 32-byte aligned memory location that will receive the +/// integer values. +/// \param __b +/// A 256-bit integer vector containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(__m256i *__a, __m256i __b) { __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a); } +/// \brief Moves double-precision values from a 256-bit vector of [4 x double] +/// to a 32-byte aligned memory location. To minimize caching, the data is +/// flagged as non-temporal (unlikely to be used again soon). +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction. +/// +/// \param __a +/// A pointer to a 32-byte aligned memory location that will receive the +/// double-precision floating point values. +/// \param __b +/// A 256-bit vector of [4 x double] containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(double *__a, __m256d __b) { __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a); } +/// \brief Moves single-precision floating point values from a 256-bit vector +/// of [8 x float] to a 32-byte aligned memory location. To minimize +/// caching, the data is flagged as non-temporal (unlikely to be used again +/// soon). +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
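The masked and non-temporal stores documented above pair naturally: one writes selected elements in place, the other bypasses the cache for bulk output. The sketch below is illustrative only (not part of the header); the names are invented, dst and aligned_dst are hypothetical buffers, and an AVX build is assumed.

#include <immintrin.h>

static void store_examples(double *dst, double *aligned_dst, __m256d v)
{
  /* Write only elements 0 and 2 of v; the other two doubles behind dst are
   * left untouched. */
  __m256i mask = _mm256_set_epi64x(0, -1, 0, -1);
  _mm256_maskstore_pd(dst, mask, v);

  /* Non-temporal store to a 32-byte aligned destination; commonly followed
   * by a store fence before the data is consumed elsewhere. */
  _mm256_stream_pd(aligned_dst, v);
  _mm_sfence();
}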
+/// +/// \param __p +/// A pointer to a 32-byte aligned memory location that will receive the +/// single-precision floating point values. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(float *__p, __m256 __a) { @@ -2515,30 +3432,105 @@ _mm256_stream_ps(float *__p, __m256 __a) } /* Create vectors */ +/// \brief Create a 256-bit vector of [4 x double] with undefined values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \returns A 256-bit vector of [4 x double] containing undefined values. static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void) { return (__m256d)__builtin_ia32_undef256(); } +/// \brief Create a 256-bit vector of [8 x float] with undefined values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \returns A 256-bit vector of [8 x float] containing undefined values. static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void) { return (__m256)__builtin_ia32_undef256(); } +/// \brief Create a 256-bit integer vector with undefined values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \returns A 256-bit integer vector containing undefined values. static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void) { return (__m256i)__builtin_ia32_undef256(); } +/// \brief Constructs a 256-bit floating-point vector of [4 x double] +/// initialized with the specified double-precision floating-point values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> +/// instruction. +/// +/// \param __a +/// A double-precision floating-point value used to initialize bits [255:192] +/// of the result. +/// \param __b +/// A double-precision floating-point value used to initialize bits [191:128] +/// of the result. +/// \param __c +/// A double-precision floating-point value used to initialize bits [127:64] +/// of the result. +/// \param __d +/// A double-precision floating-point value used to initialize bits [63:0] +/// of the result. +/// \returns An initialized 256-bit floating-point vector of [4 x double]. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_pd(double __a, double __b, double __c, double __d) { return (__m256d){ __d, __c, __b, __a }; } +/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized +/// with the specified single-precision floating-point values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A single-precision floating-point value used to initialize bits [255:224] +/// of the result. +/// \param __b +/// A single-precision floating-point value used to initialize bits [223:192] +/// of the result. +/// \param __c +/// A single-precision floating-point value used to initialize bits [191:160] +/// of the result. +/// \param __d +/// A single-precision floating-point value used to initialize bits [159:128] +/// of the result. +/// \param __e +/// A single-precision floating-point value used to initialize bits [127:96] +/// of the result. +/// \param __f +/// A single-precision floating-point value used to initialize bits [95:64] +/// of the result. +/// \param __g +/// A single-precision floating-point value used to initialize bits [63:32] +/// of the result. 
+/// \param __h +/// A single-precision floating-point value used to initialize bits [31:0] +/// of the result. +/// \returns An initialized 256-bit floating-point vector of [8 x float]. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h) @@ -2546,6 +3538,31 @@ _mm256_set_ps(float __a, float __b, float __c, float __d, return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; } +/// \brief Constructs a 256-bit integer vector initialized with the specified +/// 32-bit integral values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __i0 +/// A 32-bit integral value used to initialize bits [255:224] of the result. +/// \param __i1 +/// A 32-bit integral value used to initialize bits [223:192] of the result. +/// \param __i2 +/// A 32-bit integral value used to initialize bits [191:160] of the result. +/// \param __i3 +/// A 32-bit integral value used to initialize bits [159:128] of the result. +/// \param __i4 +/// A 32-bit integral value used to initialize bits [127:96] of the result. +/// \param __i5 +/// A 32-bit integral value used to initialize bits [95:64] of the result. +/// \param __i6 +/// A 32-bit integral value used to initialize bits [63:32] of the result. +/// \param __i7 +/// A 32-bit integral value used to initialize bits [31:0] of the result. +/// \returns An initialized 256-bit integer vector. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7) @@ -2553,6 +3570,47 @@ _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; } +/// \brief Constructs a 256-bit integer vector initialized with the specified +/// 16-bit integral values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __w15 +/// A 16-bit integral value used to initialize bits [255:240] of the result. +/// \param __w14 +/// A 16-bit integral value used to initialize bits [239:224] of the result. +/// \param __w13 +/// A 16-bit integral value used to initialize bits [223:208] of the result. +/// \param __w12 +/// A 16-bit integral value used to initialize bits [207:192] of the result. +/// \param __w11 +/// A 16-bit integral value used to initialize bits [191:176] of the result. +/// \param __w10 +/// A 16-bit integral value used to initialize bits [175:160] of the result. +/// \param __w09 +/// A 16-bit integral value used to initialize bits [159:144] of the result. +/// \param __w08 +/// A 16-bit integral value used to initialize bits [143:128] of the result. +/// \param __w07 +/// A 16-bit integral value used to initialize bits [127:112] of the result. +/// \param __w06 +/// A 16-bit integral value used to initialize bits [111:96] of the result. +/// \param __w05 +/// A 16-bit integral value used to initialize bits [95:80] of the result. +/// \param __w04 +/// A 16-bit integral value used to initialize bits [79:64] of the result. +/// \param __w03 +/// A 16-bit integral value used to initialize bits [63:48] of the result. +/// \param __w02 +/// A 16-bit integral value used to initialize bits [47:32] of the result. +/// \param __w01 +/// A 16-bit integral value used to initialize bits [31:16] of the result. 
+/// \param __w00 +/// A 16-bit integral value used to initialize bits [15:0] of the result. +/// \returns An initialized 256-bit integer vector. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, @@ -2563,6 +3621,79 @@ _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; } +/// \brief Constructs a 256-bit integer vector initialized with the specified +/// 8-bit integral values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __b31 +/// An 8-bit integral value used to initialize bits [255:248] of the result. +/// \param __b30 +/// An 8-bit integral value used to initialize bits [247:240] of the result. +/// \param __b29 +/// An 8-bit integral value used to initialize bits [239:232] of the result. +/// \param __b28 +/// An 8-bit integral value used to initialize bits [231:224] of the result. +/// \param __b27 +/// An 8-bit integral value used to initialize bits [223:216] of the result. +/// \param __b26 +/// An 8-bit integral value used to initialize bits [215:208] of the result. +/// \param __b25 +/// An 8-bit integral value used to initialize bits [207:200] of the result. +/// \param __b24 +/// An 8-bit integral value used to initialize bits [199:192] of the result. +/// \param __b23 +/// An 8-bit integral value used to initialize bits [191:184] of the result. +/// \param __b22 +/// An 8-bit integral value used to initialize bits [183:176] of the result. +/// \param __b21 +/// An 8-bit integral value used to initialize bits [175:168] of the result. +/// \param __b20 +/// An 8-bit integral value used to initialize bits [167:160] of the result. +/// \param __b19 +/// An 8-bit integral value used to initialize bits [159:152] of the result. +/// \param __b18 +/// An 8-bit integral value used to initialize bits [151:144] of the result. +/// \param __b17 +/// An 8-bit integral value used to initialize bits [143:136] of the result. +/// \param __b16 +/// An 8-bit integral value used to initialize bits [135:128] of the result. +/// \param __b15 +/// An 8-bit integral value used to initialize bits [127:120] of the result. +/// \param __b14 +/// An 8-bit integral value used to initialize bits [119:112] of the result. +/// \param __b13 +/// An 8-bit integral value used to initialize bits [111:104] of the result. +/// \param __b12 +/// An 8-bit integral value used to initialize bits [103:96] of the result. +/// \param __b11 +/// An 8-bit integral value used to initialize bits [95:88] of the result. +/// \param __b10 +/// An 8-bit integral value used to initialize bits [87:80] of the result. +/// \param __b09 +/// An 8-bit integral value used to initialize bits [79:72] of the result. +/// \param __b08 +/// An 8-bit integral value used to initialize bits [71:64] of the result. +/// \param __b07 +/// An 8-bit integral value used to initialize bits [63:56] of the result. +/// \param __b06 +/// An 8-bit integral value used to initialize bits [55:48] of the result. +/// \param __b05 +/// An 8-bit integral value used to initialize bits [47:40] of the result. +/// \param __b04 +/// An 8-bit integral value used to initialize bits [39:32] of the result. +/// \param __b03 +/// An 8-bit integral value used to initialize bits [31:24] of the result. 
+/// \param __b02 +/// An 8-bit integral value used to initialize bits [23:16] of the result. +/// \param __b01 +/// An 8-bit integral value used to initialize bits [15:8] of the result. +/// \param __b00 +/// An 8-bit integral value used to initialize bits [7:0] of the result. +/// \returns An initialized 256-bit integer vector. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, @@ -2581,6 +3712,23 @@ _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, }; } +/// \brief Constructs a 256-bit integer vector initialized with the specified +/// 64-bit integral values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> +/// instruction. +/// +/// \param __a +/// A 64-bit integral value used to initialize bits [255:192] of the result. +/// \param __b +/// A 64-bit integral value used to initialize bits [191:128] of the result. +/// \param __c +/// A 64-bit integral value used to initialize bits [127:64] of the result. +/// \param __d +/// A 64-bit integral value used to initialize bits [63:0] of the result. +/// \returns An initialized 256-bit integer vector. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) { @@ -2588,12 +3736,68 @@ _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) } /* Create vectors with elements in reverse order */ +/// \brief Constructs a 256-bit floating-point vector of [4 x double], +/// initialized in reverse order with the specified double-precision +/// floating-point values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> +/// instruction. +/// +/// \param __a +/// A double-precision floating-point value used to initialize bits [63:0] +/// of the result. +/// \param __b +/// A double-precision floating-point value used to initialize bits [127:64] +/// of the result. +/// \param __c +/// A double-precision floating-point value used to initialize bits [191:128] +/// of the result. +/// \param __d +/// A double-precision floating-point value used to initialize bits [255:192] +/// of the result. +/// \returns An initialized 256-bit floating-point vector of [4 x double]. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_pd(double __a, double __b, double __c, double __d) { return (__m256d){ __a, __b, __c, __d }; } +/// \brief Constructs a 256-bit floating-point vector of [8 x float], +/// initialized in reverse order with the specified single-precision +/// floating-point values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A single-precision floating-point value used to initialize bits [31:0] +/// of the result. +/// \param __b +/// A single-precision floating-point value used to initialize bits [63:32] +/// of the result. +/// \param __c +/// A single-precision floating-point value used to initialize bits [95:64] +/// of the result. +/// \param __d +/// A single-precision floating-point value used to initialize bits [127:96] +/// of the result. +/// \param __e +/// A single-precision floating-point value used to initialize bits [159:128] +/// of the result. +/// \param __f +/// A single-precision floating-point value used to initialize bits [191:160] +/// of the result.
+/// \param __g +/// A single-precision floating-point value used to initialize bits [223:192] +/// of the result. +/// \param __h +/// A single-precision floating-point value used to initialize bits [255:224] +/// of the result. +/// \returns An initialized 256-bit floating-point vector of [8 x float]. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h) @@ -2601,6 +3805,31 @@ _mm256_setr_ps(float __a, float __b, float __c, float __d, return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; } +/// \brief Constructs a 256-bit integer vector, initialized in reverse order +/// with the specified 32-bit integral values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __i0 +/// A 32-bit integral value used to initialize bits [31:0] of the result. +/// \param __i1 +/// A 32-bit integral value used to initialize bits [63:32] of the result. +/// \param __i2 +/// A 32-bit integral value used to initialize bits [95:64] of the result. +/// \param __i3 +/// A 32-bit integral value used to initialize bits [127:96] of the result. +/// \param __i4 +/// A 32-bit integral value used to initialize bits [159:128] of the result. +/// \param __i5 +/// A 32-bit integral value used to initialize bits [191:160] of the result. +/// \param __i6 +/// A 32-bit integral value used to initialize bits [223:192] of the result. +/// \param __i7 +/// A 32-bit integral value used to initialize bits [255:224] of the result. +/// \returns An initialized 256-bit integer vector. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7) @@ -2608,6 +3837,47 @@ _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; } +/// \brief Constructs a 256-bit integer vector, initialized in reverse order +/// with the specified 16-bit integral values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __w15 +/// A 16-bit integral value used to initialize bits [15:0] of the result. +/// \param __w14 +/// A 16-bit integral value used to initialize bits [31:16] of the result. +/// \param __w13 +/// A 16-bit integral value used to initialize bits [47:32] of the result. +/// \param __w12 +/// A 16-bit integral value used to initialize bits [63:48] of the result. +/// \param __w11 +/// A 16-bit integral value used to initialize bits [79:64] of the result. +/// \param __w10 +/// A 16-bit integral value used to initialize bits [95:80] of the result. +/// \param __w09 +/// A 16-bit integral value used to initialize bits [111:96] of the result. +/// \param __w08 +/// A 16-bit integral value used to initialize bits [127:112] of the result. +/// \param __w07 +/// A 16-bit integral value used to initialize bits [143:128] of the result. +/// \param __w06 +/// A 16-bit integral value used to initialize bits [159:144] of the result. +/// \param __w05 +/// A 16-bit integral value used to initialize bits [175:160] of the result. +/// \param __w04 +/// A 16-bit integral value used to initialize bits [191:176] of the result. +/// \param __w03 +/// A 16-bit integral value used to initialize bits [207:192] of the result. 
+/// \param __w02 +/// A 16-bit integral value used to initialize bits [223:208] of the result. +/// \param __w01 +/// A 16-bit integral value used to initialize bits [239:224] of the result. +/// \param __w00 +/// A 16-bit integral value used to initialize bits [255:240] of the result. +/// \returns An initialized 256-bit integer vector. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, @@ -2618,6 +3888,79 @@ _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 }; } +/// \brief Constructs a 256-bit integer vector, initialized in reverse order +/// with the specified 8-bit integral values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __b31 +/// An 8-bit integral value used to initialize bits [7:0] of the result. +/// \param __b30 +/// An 8-bit integral value used to initialize bits [15:8] of the result. +/// \param __b29 +/// An 8-bit integral value used to initialize bits [23:16] of the result. +/// \param __b28 +/// An 8-bit integral value used to initialize bits [31:24] of the result. +/// \param __b27 +/// An 8-bit integral value used to initialize bits [39:32] of the result. +/// \param __b26 +/// An 8-bit integral value used to initialize bits [47:40] of the result. +/// \param __b25 +/// An 8-bit integral value used to initialize bits [55:48] of the result. +/// \param __b24 +/// An 8-bit integral value used to initialize bits [63:56] of the result. +/// \param __b23 +/// An 8-bit integral value used to initialize bits [71:64] of the result. +/// \param __b22 +/// An 8-bit integral value used to initialize bits [79:72] of the result. +/// \param __b21 +/// An 8-bit integral value used to initialize bits [87:80] of the result. +/// \param __b20 +/// An 8-bit integral value used to initialize bits [95:88] of the result. +/// \param __b19 +/// An 8-bit integral value used to initialize bits [103:96] of the result. +/// \param __b18 +/// An 8-bit integral value used to initialize bits [111:104] of the result. +/// \param __b17 +/// An 8-bit integral value used to initialize bits [119:112] of the result. +/// \param __b16 +/// An 8-bit integral value used to initialize bits [127:120] of the result. +/// \param __b15 +/// An 8-bit integral value used to initialize bits [135:128] of the result. +/// \param __b14 +/// An 8-bit integral value used to initialize bits [143:136] of the result. +/// \param __b13 +/// An 8-bit integral value used to initialize bits [151:144] of the result. +/// \param __b12 +/// An 8-bit integral value used to initialize bits [159:152] of the result. +/// \param __b11 +/// An 8-bit integral value used to initialize bits [167:160] of the result. +/// \param __b10 +/// An 8-bit integral value used to initialize bits [175:168] of the result. +/// \param __b09 +/// An 8-bit integral value used to initialize bits [183:176] of the result. +/// \param __b08 +/// An 8-bit integral value used to initialize bits [191:184] of the result. +/// \param __b07 +/// An 8-bit integral value used to initialize bits [199:192] of the result. +/// \param __b06 +/// An 8-bit integral value used to initialize bits [207:200] of the result. +/// \param __b05 +/// An 8-bit integral value used to initialize bits [215:208] of the result. 
+/// \param __b04 +/// An 8-bit integral value used to initialize bits [223:216] of the result. +/// \param __b03 +/// An 8-bit integral value used to initialize bits [231:224] of the result. +/// \param __b02 +/// An 8-bit integral value used to initialize bits [239:232] of the result. +/// \param __b01 +/// An 8-bit integral value used to initialize bits [247:240] of the result. +/// \param __b00 +/// An 8-bit integral value used to initialize bits [255:248] of the result. +/// \returns An initialized 256-bit integer vector. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, @@ -2635,6 +3978,23 @@ _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; } +/// \brief Constructs a 256-bit integer vector, initialized in reverse order +/// with the specified 64-bit integral values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> +/// instruction. +/// +/// \param __a +/// A 64-bit integral value used to initialize bits [63:0] of the result. +/// \param __b +/// A 64-bit integral value used to initialize bits [127:64] of the result. +/// \param __c +/// A 64-bit integral value used to initialize bits [191:128] of the result. +/// \param __d +/// A 64-bit integral value used to initialize bits [255:192] of the result. +/// \returns An initialized 256-bit integer vector. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) { @@ -2642,24 +4002,74 @@ _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) } /* Create vectors with repeated elements */ +/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each +/// of the four double-precision floating-point vector elements set to the +/// specified double-precision floating-point value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize each vector +/// element of the result. +/// \returns An initialized 256-bit floating-point vector of [4 x double]. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set1_pd(double __w) { return (__m256d){ __w, __w, __w, __w }; } +/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each +/// of the eight single-precision floating-point vector elements set to the +/// specified single-precision floating-point value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> +/// instruction. +/// +/// \param __w +/// A single-precision floating-point value used to initialize each vector +/// element of the result. +/// \returns An initialized 256-bit floating-point vector of [8 x float]. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set1_ps(float __w) { return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; } +/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the +/// 32-bit integral vector elements set to the specified 32-bit integral +/// value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> +/// instruction. +/// +/// \param __i +/// A 32-bit integral value used to initialize each vector element of the +/// result. 
+/// \returns An initialized 256-bit integer vector of [8 x i32]. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi32(int __i) { return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i }; } +/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the +/// 16-bit integral vector elements set to the specified 16-bit integral +/// value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. +/// +/// \param __w +/// A 16-bit integral value used to initialize each vector element of the +/// result. +/// \returns An initialized 256-bit integer vector of [16 x i16]. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi16(short __w) { @@ -2667,6 +4077,17 @@ _mm256_set1_epi16(short __w) __w, __w, __w, __w, __w, __w }; } +/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the +/// 8-bit integral vector elements set to the specified 8-bit integral value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. +/// +/// \param __b +/// An 8-bit integral value used to initialize each vector element of the +/// result. +/// \returns An initialized 256-bit integer vector of [32 x i8]. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi8(char __b) { @@ -2675,6 +4096,18 @@ _mm256_set1_epi8(char __b) __b, __b, __b, __b, __b, __b, __b }; } +/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the +/// 64-bit integral vector elements set to the specified 64-bit integral +/// value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. +/// +/// \param __q +/// A 64-bit integral value used to initialize each vector element of the +/// result. +/// \returns An initialized 256-bit integer vector of [4 x i64]. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi64x(long long __q) { @@ -2682,18 +4115,41 @@ _mm256_set1_epi64x(long long __q) } /* Create __zeroed vectors */ +/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all +/// vector elements initialized to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VXORPS </c> instruction. +/// +/// \returns A 256-bit vector of [4 x double] with all elements set to zero. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setzero_pd(void) { return (__m256d){ 0, 0, 0, 0 }; } +/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all +/// vector elements initialized to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VXORPS </c> instruction. +/// +/// \returns A 256-bit vector of [8 x float] with all elements set to zero. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void) { return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; } +/// \brief Constructs a 256-bit integer vector initialized to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VXORPS </c> instruction. +/// +/// \returns A 256-bit integer vector initialized to zero. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void) { @@ -2701,72 +4157,210 @@ _mm256_setzero_si256(void) } /* Cast between vector types */ +/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit +/// floating-point vector of [8 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. 
+/// +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. +/// \returns A 256-bit floating-point vector of [8 x float] containing the same +/// bitwise pattern as the parameter. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castpd_ps(__m256d __a) { return (__m256)__a; } +/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit +/// integer vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. +/// \returns A 256-bit integer vector containing the same bitwise pattern as the +/// parameter. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castpd_si256(__m256d __a) { return (__m256i)__a; } +/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit +/// floating-point vector of [4 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [8 x float]. +/// \returns A 256-bit floating-point vector of [4 x double] containing the same +/// bitwise pattern as the parameter. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castps_pd(__m256 __a) { return (__m256d)__a; } +/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit +/// integer vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [8 x float]. +/// \returns A 256-bit integer vector containing the same bitwise pattern as the +/// parameter. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castps_si256(__m256 __a) { return (__m256i)__a; } +/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector +/// of [8 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \returns A 256-bit floating-point vector of [8 x float] containing the same +/// bitwise pattern as the parameter. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castsi256_ps(__m256i __a) { return (__m256)__a; } +/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector +/// of [4 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \returns A 256-bit floating-point vector of [4 x double] containing the same +/// bitwise pattern as the parameter. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castsi256_pd(__m256i __a) { return (__m256d)__a; } +/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of +/// [4 x double] as a 128-bit floating-point vector of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. +/// \returns A 128-bit floating-point vector of [2 x double] containing the +/// lower 128 bits of the parameter. static __inline __m128d __DEFAULT_FN_ATTRS _mm256_castpd256_pd128(__m256d __a) { return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); } +/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of +/// [8 x float] as a 128-bit floating-point vector of [4 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. 
+/// +/// \param __a +/// A 256-bit floating-point vector of [8 x float]. +/// \returns A 128-bit floating-point vector of [4 x float] containing the +/// lower 128 bits of the parameter. static __inline __m128 __DEFAULT_FN_ATTRS _mm256_castps256_ps128(__m256 __a) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); } +/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \returns A 128-bit integer vector containing the lower 128 bits of the +/// parameter. static __inline __m128i __DEFAULT_FN_ATTRS _mm256_castsi256_si128(__m256i __a) { return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); } +/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a +/// 128-bit floating-point vector of [2 x double]. The lower 128 bits +/// contain the value of the source vector. The contents of the upper 128 +/// bits are undefined. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits +/// contain the value of the parameter. The contents of the upper 128 bits +/// are undefined. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a) { return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); } +/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a +/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain +/// the value of the source vector. The contents of the upper 128 bits are +/// undefined. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits +/// contain the value of the parameter. The contents of the upper 128 bits +/// are undefined. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a) { return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); } +/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. +/// The lower 128 bits contain the value of the source vector. The contents +/// of the upper 128 bits are undefined. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 256-bit integer vector. The lower 128 bits contain the value of +/// the parameter. The contents of the upper 128 bits are undefined. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a) { @@ -2778,6 +4372,38 @@ _mm256_castsi128_si256(__m128i __a) We use macros rather than inlines because we only want to accept invocations where the immediate M is a constant expression. */ +/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating +/// a 256-bit vector of [8 x float] given in the first parameter, and then +/// replacing either the upper or the lower 128 bits with the contents of a +/// 128-bit vector of [4 x float] in the second parameter. The immediate +/// integer parameter determines between the upper or the lower 128 bits. 
+/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. +/// +/// \param V1 +/// A 256-bit vector of [8 x float]. This vector is copied to the result +/// first, and then either the upper or the lower 128 bits of the result will +/// be replaced by the contents of \a V2. +/// \param V2 +/// A 128-bit vector of [4 x float]. The contents of this parameter are +/// written to either the upper or the lower 128 bits of the result depending +/// on the value of parameter \a M. +/// \param M +/// An immediate integer. The least significant bit determines how the values +/// from the two parameters are interleaved: \n +/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, +/// and bits [255:128] of \a V1 are copied to bits [255:128] of the +/// result. \n +/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the +/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the +/// result. +/// \returns A 256-bit vector of [8 x float] containing the interleaved values. #define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \ (__m256)__builtin_shufflevector( \ (__v8sf)(__m256)(V1), \ @@ -2791,6 +4417,38 @@ _mm256_castsi128_si256(__m128i __a) (((M) & 1) ? 10 : 6), \ (((M) & 1) ? 11 : 7) );}) +/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating +/// a 256-bit vector of [4 x double] given in the first parameter, and then +/// replacing either the upper or the lower 128 bits with the contents of a +/// 128-bit vector of [2 x double] in the second parameter. The immediate +/// integer parameter determines between the upper or the lower 128 bits. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. +/// +/// \param V1 +/// A 256-bit vector of [4 x double]. This vector is copied to the result +/// first, and then either the upper or the lower 128 bits of the result will +/// be replaced by the contents of \a V2. +/// \param V2 +/// A 128-bit vector of [2 x double]. The contents of this parameter are +/// written to either the upper or the lower 128 bits of the result depending +/// on the value of parameter \a M. +/// \param M +/// An immediate integer. The least significant bit determines how the values +/// from the two parameters are interleaved: \n +/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, +/// and bits [255:128] of \a V1 are copied to bits [255:128] of the +/// result. \n +/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the +/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the +/// result. +/// \returns A 256-bit vector of [4 x double] containing the interleaved values. #define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \ (__m256d)__builtin_shufflevector( \ (__v4df)(__m256d)(V1), \ @@ -2800,6 +4458,38 @@ _mm256_castsi128_si256(__m128i __a) (((M) & 1) ? 4 : 2), \ (((M) & 1) ? 5 : 3) );}) +/// \brief Constructs a new 256-bit integer vector by first duplicating a +/// 256-bit integer vector given in the first parameter, and then replacing +/// either the upper or the lower 128 bits with the contents of a 128-bit +/// integer vector in the second parameter. 
The immediate integer parameter +/// determines between the upper or the lower 128 bits. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. +/// +/// \param V1 +/// A 256-bit integer vector. This vector is copied to the result first, and +/// then either the upper or the lower 128 bits of the result will be +/// replaced by the contents of \a V2. +/// \param V2 +/// A 128-bit integer vector. The contents of this parameter are written to +/// either the upper or the lower 128 bits of the result depending on the +/// value of parameter \a M. +/// \param M +/// An immediate integer. The least significant bit determines how the values +/// from the two parameters are interleaved: \n +/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, +/// and bits [255:128] of \a V1 are copied to bits [255:128] of the +/// result. \n +/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the +/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the +/// result. +/// \returns A 256-bit integer vector containing the interleaved values. #define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \ (__m256i)__builtin_shufflevector( \ (__v4di)(__m256i)(V1), \ @@ -2814,6 +4504,27 @@ _mm256_castsi128_si256(__m128i __a) We use macros rather than inlines because we only want to accept invocations where the immediate M is a constant expression. */ +/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector +/// of [8 x float], as determined by the immediate integer parameter, and +/// returns the extracted bits as a 128-bit vector of [4 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128 _mm256_extractf128_ps(__m256 V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction. +/// +/// \param V +/// A 256-bit vector of [8 x float]. +/// \param M +/// An immediate integer. The least significant bit determines which bits are +/// extracted from the first parameter: \n +/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the +/// result. \n +/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. +/// \returns A 128-bit vector of [4 x float] containing the extracted bits. #define _mm256_extractf128_ps(V, M) __extension__ ({ \ (__m128)__builtin_shufflevector( \ (__v8sf)(__m256)(V), \ @@ -2823,6 +4534,27 @@ _mm256_castsi128_si256(__m128i __a) (((M) & 1) ? 6 : 2), \ (((M) & 1) ? 7 : 3) );}) +/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector +/// of [4 x double], as determined by the immediate integer parameter, and +/// returns the extracted bits as a 128-bit vector of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128d _mm256_extractf128_pd(__m256d V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction. +/// +/// \param V +/// A 256-bit vector of [4 x double]. +/// \param M +/// An immediate integer. The least significant bit determines which bits are +/// extracted from the first parameter: \n +/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the +/// result. \n +/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. +/// \returns A 128-bit vector of [2 x double] containing the extracted bits. 
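As a usage illustration of the selector semantics documented above, the following standalone sketch (not part of the header; hypothetical demo code, assuming an AVX-capable target built with e.g. -mavx) inserts a 128-bit half at M = 1 and then extracts both halves:

/* Hypothetical standalone demo; compile with e.g. `cc -mavx demo.c`. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m256 v  = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7); /* element 0 in bits [31:0] */
  __m128 hi = _mm_set1_ps(9.0f);

  /* M = 1: bits [255:128] of v are replaced by hi, bits [127:0] are kept. */
  __m256 r = _mm256_insertf128_ps(v, hi, 1);

  /* M = 0 extracts bits [127:0]; M = 1 extracts bits [255:128]. */
  __m128 lo128 = _mm256_extractf128_ps(r, 0);
  __m128 hi128 = _mm256_extractf128_ps(r, 1);

  float out[4];
  _mm_storeu_ps(out, lo128);
  printf("lo: %g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0 1 2 3 */
  _mm_storeu_ps(out, hi128);
  printf("hi: %g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 9 9 9 9 */
  return 0;
}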
#define _mm256_extractf128_pd(V, M) __extension__ ({ \ (__m128d)__builtin_shufflevector( \ (__v4df)(__m256d)(V), \ @@ -2830,6 +4562,27 @@ _mm256_castsi128_si256(__m128i __a) (((M) & 1) ? 2 : 0), \ (((M) & 1) ? 3 : 1) );}) +/// \brief Extracts either the upper or the lower 128 bits from a 256-bit +/// integer vector, as determined by the immediate integer parameter, and +/// returns the extracted bits as a 128-bit integer vector. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm256_extractf128_si256(__m256i V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction. +/// +/// \param V +/// A 256-bit integer vector. +/// \param M +/// An immediate integer. The least significant bit determines which bits are +/// extracted from the first parameter: \n +/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the +/// result. \n +/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. +/// \returns A 128-bit integer vector containing the extracted bits. #define _mm256_extractf128_si256(V, M) __extension__ ({ \ (__m128i)__builtin_shufflevector( \ (__v4di)(__m256i)(V), \ @@ -2838,6 +4591,27 @@ _mm256_castsi128_si256(__m128i __a) (((M) & 1) ? 3 : 1) );}) /* SIMD load ops (unaligned) */ +/// \brief Loads two 128-bit floating-point vectors of [4 x float] from +/// unaligned memory locations and constructs a 256-bit floating-point vector +/// of [8 x float] by concatenating the two 128-bit vectors. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to load instructions followed by the +/// <c> VINSERTF128 </c> instruction. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location containing 4 consecutive +/// single-precision floating-point values. These values are to be copied to +/// bits[255:128] of the result. The address of the memory location does not +/// have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location containing 4 consecutive +/// single-precision floating-point values. These values are to be copied to +/// bits[127:0] of the result. The address of the memory location does not +/// have to be aligned. +/// \returns A 256-bit floating-point vector of [8 x float] containing the +/// concatenated result. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) { @@ -2845,6 +4619,27 @@ _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1); } +/// \brief Loads two 128-bit floating-point vectors of [2 x double] from +/// unaligned memory locations and constructs a 256-bit floating-point vector +/// of [4 x double] by concatenating the two 128-bit vectors. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to load instructions followed by the +/// <c> VINSERTF128 </c> instruction. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location containing two consecutive +/// double-precision floating-point values. These values are to be copied to +/// bits[255:128] of the result. The address of the memory location does not +/// have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location containing two consecutive +/// double-precision floating-point values. These values are to be copied to +/// bits[127:0] of the result. The address of the memory location does not +/// have to be aligned. 
+/// \returns A 256-bit floating-point vector of [4 x double] containing the +/// concatenated result. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) { @@ -2852,6 +4647,24 @@ _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1); } +/// \brief Loads two 128-bit integer vectors from unaligned memory locations and +/// constructs a 256-bit integer vector by concatenating the two 128-bit +/// vectors. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to load instructions followed by the +/// <c> VINSERTF128 </c> instruction. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location containing a 128-bit integer +/// vector. This vector is to be copied to bits[255:128] of the result. The +/// address of the memory location does not have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location containing a 128-bit integer +/// vector. This vector is to be copied to bits[127:0] of the result. The +/// address of the memory location does not have to be aligned. +/// \returns A 256-bit integer vector containing the concatenated result. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) { @@ -2860,6 +4673,24 @@ _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) } /* SIMD store ops (unaligned) */ +/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point +/// vector of [8 x float] into two different unaligned memory locations. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the +/// store instructions. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __a +/// A 256-bit floating-point vector of [8 x float]. static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) { @@ -2871,6 +4702,24 @@ _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) _mm_storeu_ps(__addr_hi, __v128); } +/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point +/// vector of [4 x double] into two different unaligned memory locations. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the +/// store instructions. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. 
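A minimal sketch of the split unaligned load/store pair described above (hypothetical helper name, assumes an AVX target):

/* Hypothetical demo of _mm256_loadu2_m128 / _mm256_storeu2_m128. */
#include <immintrin.h>

void copy_two_halves(float *dst_hi, float *dst_lo,
                     const float *src_hi, const float *src_lo) {
  /* Concatenate two unaligned 128-bit loads: src_hi -> bits [255:128],
     src_lo -> bits [127:0]. */
  __m256 v = _mm256_loadu2_m128(src_hi, src_lo);
  /* Split the vector back out; none of the pointers needs to be aligned. */
  _mm256_storeu2_m128(dst_hi, dst_lo, v);
}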
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) { @@ -2882,6 +4731,24 @@ _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) _mm_storeu_pd(__addr_hi, __v128); } +/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into +/// two different unaligned memory locations. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the +/// store instructions. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __a +/// A 256-bit integer vector. static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) { @@ -2893,33 +4760,132 @@ _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) _mm_storeu_si128(__addr_hi, __v128); } +/// \brief Constructs a 256-bit floating-point vector of [8 x float] by +/// concatenating two 128-bit floating-point vectors of [4 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. +/// +/// \param __hi +/// A 128-bit floating-point vector of [4 x float] to be copied to the upper +/// 128 bits of the result. +/// \param __lo +/// A 128-bit floating-point vector of [4 x float] to be copied to the lower +/// 128 bits of the result. +/// \returns A 256-bit floating-point vector of [8 x float] containing the +/// concatenated result. static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_set_m128 (__m128 __hi, __m128 __lo) { +_mm256_set_m128 (__m128 __hi, __m128 __lo) +{ return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); } +/// \brief Constructs a 256-bit floating-point vector of [4 x double] by +/// concatenating two 128-bit floating-point vectors of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. +/// +/// \param __hi +/// A 128-bit floating-point vector of [2 x double] to be copied to the upper +/// 128 bits of the result. +/// \param __lo +/// A 128-bit floating-point vector of [2 x double] to be copied to the lower +/// 128 bits of the result. +/// \returns A 256-bit floating-point vector of [4 x double] containing the +/// concatenated result. static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_set_m128d (__m128d __hi, __m128d __lo) { +_mm256_set_m128d (__m128d __hi, __m128d __lo) +{ return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); } +/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit +/// integer vectors. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. +/// +/// \param __hi +/// A 128-bit integer vector to be copied to the upper 128 bits of the +/// result. +/// \param __lo +/// A 128-bit integer vector to be copied to the lower 128 bits of the +/// result. +/// \returns A 256-bit integer vector containing the concatenated result. 
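A short sketch of concatenating two 128-bit halves with the _mm256_set_m128* family (hypothetical wrapper names, assumes an AVX target; the _mm256_setr_m128* variants documented just below take the same halves in the opposite argument order):

/* Hypothetical demo of building a 256-bit vector from two 128-bit halves. */
#include <immintrin.h>

__m256d make_pd(__m128d hi, __m128d lo) {
  /* hi lands in bits [255:128], lo in bits [127:0]. */
  return _mm256_set_m128d(hi, lo);
}

__m256i make_si(__m128i hi, __m128i lo) {
  return _mm256_set_m128i(hi, lo);
}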
static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_set_m128i (__m128i __hi, __m128i __lo) { +_mm256_set_m128i (__m128i __hi, __m128i __lo) +{ return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); } +/// \brief Constructs a 256-bit floating-point vector of [8 x float] by +/// concatenating two 128-bit floating-point vectors of [4 x float]. This is +/// similar to _mm256_set_m128, but the order of the input parameters is +/// swapped. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. +/// +/// \param __lo +/// A 128-bit floating-point vector of [4 x float] to be copied to the lower +/// 128 bits of the result. +/// \param __hi +/// A 128-bit floating-point vector of [4 x float] to be copied to the upper +/// 128 bits of the result. +/// \returns A 256-bit floating-point vector of [8 x float] containing the +/// concatenated result. static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_setr_m128 (__m128 __lo, __m128 __hi) { +_mm256_setr_m128 (__m128 __lo, __m128 __hi) +{ return _mm256_set_m128(__hi, __lo); } +/// \brief Constructs a 256-bit floating-point vector of [4 x double] by +/// concatenating two 128-bit floating-point vectors of [2 x double]. This is +/// similar to _mm256_set_m128d, but the order of the input parameters is +/// swapped. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. +/// +/// \param __lo +/// A 128-bit floating-point vector of [2 x double] to be copied to the lower +/// 128 bits of the result. +/// \param __hi +/// A 128-bit floating-point vector of [2 x double] to be copied to the upper +/// 128 bits of the result. +/// \returns A 256-bit floating-point vector of [4 x double] containing the +/// concatenated result. static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_setr_m128d (__m128d __lo, __m128d __hi) { +_mm256_setr_m128d (__m128d __lo, __m128d __hi) +{ return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); } +/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit +/// integer vectors. This is similar to _mm256_set_m128i, but the order of +/// the input parameters is swapped. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. +/// +/// \param __lo +/// A 128-bit integer vector to be copied to the lower 128 bits of the +/// result. +/// \param __hi +/// A 128-bit integer vector to be copied to the upper 128 bits of the +/// result. +/// \returns A 256-bit integer vector containing the concatenated result. static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_setr_m128i (__m128i __lo, __m128i __hi) { +_mm256_setr_m128i (__m128i __lo, __m128i __hi) +{ return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); } diff --git a/lib/Headers/bmiintrin.h b/lib/Headers/bmiintrin.h index 30acfaeb9f3b..488eb2dbd3d4 100644 --- a/lib/Headers/bmiintrin.h +++ b/lib/Headers/bmiintrin.h @@ -36,7 +36,7 @@ /// unsigned short _tzcnt_u16(unsigned short a); /// \endcode /// -/// This intrinsic corresponds to the \c TZCNT instruction. +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. /// /// \param a /// An unsigned 16-bit integer whose trailing zeros are to be counted. @@ -53,7 +53,7 @@ /// unsigned int _andn_u32(unsigned int a, unsigned int b); /// \endcode /// -/// This intrinsic corresponds to the \c ANDN instruction. +/// This intrinsic corresponds to the <c> ANDN </c> instruction. 
/// /// \param a /// An unsigned integer containing one of the operands. @@ -73,7 +73,7 @@ /// unsigned int _blsi_u32(unsigned int a); /// \endcode /// -/// This intrinsic corresponds to the \c BLSI instruction. +/// This intrinsic corresponds to the <c> BLSI </c> instruction. /// /// \param a /// An unsigned integer whose bits are to be cleared. @@ -91,7 +91,7 @@ /// unsigned int _blsmsk_u32(unsigned int a); /// \endcode /// -/// This intrinsic corresponds to the \c BLSMSK instruction. +/// This intrinsic corresponds to the <c> BLSMSK </c> instruction. /// /// \param a /// An unsigned integer used to create the mask. @@ -107,7 +107,7 @@ /// unsigned int _blsr_u32(unsigned int a); /// \endcode /// -/// This intrinsic corresponds to the \c BLSR instruction. +/// This intrinsic corresponds to the <c> BLSR </c> instruction. /// /// \param a /// An unsigned integer containing the operand to be cleared. @@ -123,7 +123,7 @@ /// unsigned int _tzcnt_u32(unsigned int a); /// \endcode /// -/// This intrinsic corresponds to the \c TZCNT instruction. +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. /// /// \param a /// An unsigned 32-bit integer whose trailing zeros are to be counted. @@ -143,7 +143,7 @@ /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c TZCNT instruction. +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. /// /// \param __X /// An unsigned 16-bit integer whose trailing zeros are to be counted. @@ -160,7 +160,7 @@ __tzcnt_u16(unsigned short __X) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c ANDN instruction. +/// This intrinsic corresponds to the <c> ANDN </c> instruction. /// /// \param __X /// An unsigned integer containing one of the operands. @@ -180,7 +180,7 @@ __andn_u32(unsigned int __X, unsigned int __Y) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c BEXTR instruction. +/// This intrinsic corresponds to the <c> BEXTR </c> instruction. /// /// \param __X /// An unsigned integer whose bits are to be extracted. @@ -202,7 +202,7 @@ __bextr_u32(unsigned int __X, unsigned int __Y) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c BEXTR instruction. +/// This intrinsic corresponds to the <c> BEXTR </c> instruction. /// /// \param __X /// An unsigned integer whose bits are to be extracted. @@ -225,7 +225,7 @@ _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c BLSI instruction. +/// This intrinsic corresponds to the <c> BLSI </c> instruction. /// /// \param __X /// An unsigned integer whose bits are to be cleared. @@ -243,7 +243,7 @@ __blsi_u32(unsigned int __X) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c BLSMSK instruction. +/// This intrinsic corresponds to the <c> BLSMSK </c> instruction. /// /// \param __X /// An unsigned integer used to create the mask. @@ -259,7 +259,7 @@ __blsmsk_u32(unsigned int __X) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c BLSR instruction. +/// This intrinsic corresponds to the <c> BLSR </c> instruction. /// /// \param __X /// An unsigned integer containing the operand to be cleared. @@ -275,7 +275,7 @@ __blsr_u32(unsigned int __X) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c TZCNT instruction. +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. 
/// /// \param __X /// An unsigned 32-bit integer whose trailing zeros are to be counted. @@ -291,12 +291,12 @@ __tzcnt_u32(unsigned int __X) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c TZCNT instruction. +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. /// /// \param __X /// An unsigned 32-bit integer whose trailing zeros are to be counted. -/// \returns An 32-bit integer containing the number of trailing zero -/// bits in the operand. +/// \returns An 32-bit integer containing the number of trailing zero bits in +/// the operand. static __inline__ int __RELAXED_FN_ATTRS _mm_tzcnt_32(unsigned int __X) { @@ -314,7 +314,7 @@ _mm_tzcnt_32(unsigned int __X) /// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b); /// \endcode /// -/// This intrinsic corresponds to the \c ANDN instruction. +/// This intrinsic corresponds to the <c> ANDN </c> instruction. /// /// \param a /// An unsigned 64-bit integer containing one of the operands. @@ -334,7 +334,7 @@ _mm_tzcnt_32(unsigned int __X) /// unsigned long long _blsi_u64(unsigned long long a); /// \endcode /// -/// This intrinsic corresponds to the \c BLSI instruction. +/// This intrinsic corresponds to the <c> BLSI </c> instruction. /// /// \param a /// An unsigned 64-bit integer whose bits are to be cleared. @@ -352,7 +352,7 @@ _mm_tzcnt_32(unsigned int __X) /// unsigned long long _blsmsk_u64(unsigned long long a); /// \endcode /// -/// This intrinsic corresponds to the \c BLSMSK instruction. +/// This intrinsic corresponds to the <c> BLSMSK </c> instruction. /// /// \param a /// An unsigned 64-bit integer used to create the mask. @@ -368,7 +368,7 @@ _mm_tzcnt_32(unsigned int __X) /// unsigned long long _blsr_u64(unsigned long long a); /// \endcode /// -/// This intrinsic corresponds to the \c BLSR instruction. +/// This intrinsic corresponds to the <c> BLSR </c> instruction. /// /// \param a /// An unsigned 64-bit integer containing the operand to be cleared. @@ -384,7 +384,7 @@ _mm_tzcnt_32(unsigned int __X) /// unsigned long long _tzcnt_u64(unsigned long long a); /// \endcode /// -/// This intrinsic corresponds to the \c TZCNT instruction. +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. /// /// \param a /// An unsigned 64-bit integer whose trailing zeros are to be counted. @@ -397,7 +397,7 @@ _mm_tzcnt_32(unsigned int __X) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c ANDN instruction. +/// This intrinsic corresponds to the <c> ANDN </c> instruction. /// /// \param __X /// An unsigned 64-bit integer containing one of the operands. @@ -417,7 +417,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c BEXTR instruction. +/// This intrinsic corresponds to the <c> BEXTR </c> instruction. /// /// \param __X /// An unsigned 64-bit integer whose bits are to be extracted. @@ -439,7 +439,7 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c BEXTR instruction. +/// This intrinsic corresponds to the <c> BEXTR </c> instruction. /// /// \param __X /// An unsigned 64-bit integer whose bits are to be extracted. @@ -462,7 +462,7 @@ _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c BLSI instruction. 
+/// This intrinsic corresponds to the <c> BLSI </c> instruction. /// /// \param __X /// An unsigned 64-bit integer whose bits are to be cleared. @@ -480,7 +480,7 @@ __blsi_u64(unsigned long long __X) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c BLSMSK instruction. +/// This intrinsic corresponds to the <c> BLSMSK </c> instruction. /// /// \param __X /// An unsigned 64-bit integer used to create the mask. @@ -496,7 +496,7 @@ __blsmsk_u64(unsigned long long __X) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c BLSR instruction. +/// This intrinsic corresponds to the <c> BLSR </c> instruction. /// /// \param __X /// An unsigned 64-bit integer containing the operand to be cleared. @@ -512,7 +512,7 @@ __blsr_u64(unsigned long long __X) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c TZCNT instruction. +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. /// /// \param __X /// An unsigned 64-bit integer whose trailing zeros are to be counted. @@ -528,12 +528,12 @@ __tzcnt_u64(unsigned long long __X) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c TZCNT instruction. +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. /// /// \param __X /// An unsigned 64-bit integer whose trailing zeros are to be counted. -/// \returns An 64-bit integer containing the number of trailing zero -/// bits in the operand. +/// \returns An 64-bit integer containing the number of trailing zero bits in +/// the operand. static __inline__ long long __RELAXED_FN_ATTRS _mm_tzcnt_64(unsigned long long __X) { diff --git a/lib/Headers/cuda_wrappers/algorithm b/lib/Headers/cuda_wrappers/algorithm new file mode 100644 index 000000000000..95d9beb73c68 --- /dev/null +++ b/lib/Headers/cuda_wrappers/algorithm @@ -0,0 +1,96 @@ +/*===---- complex - CUDA wrapper for <algorithm> ----------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM +#define __CLANG_CUDA_WRAPPERS_ALGORITHM + +// This header defines __device__ overloads of std::min/max, but only if we're +// <= C++11. In C++14, these functions are constexpr, and so are implicitly +// __host__ __device__. +// +// We don't support the initializer_list overloads because +// initializer_list::begin() and end() are not __host__ __device__ functions. 
+// +// When compiling in C++14 mode, we could force std::min/max to have different +// implementations for host and device, by declaring the device overloads +// before the constexpr overloads appear. We choose not to do this because + +// a) why write our own implementation when we can use one from the standard +// library? and +// b) libstdc++ is evil and declares min/max inside a header that is included +// *before* we include <algorithm>. So we'd have to unconditionally +// declare our __device__ overloads of min/max, but that would pollute +// things for people who choose not to include <algorithm>. + +#include_next <algorithm> + +#if __cplusplus <= 201103L + +// We need to define these overloads in exactly the namespace our standard +// library uses (including the right inline namespace), otherwise they won't be +// picked up by other functions in the standard library (e.g. functions in +// <complex>). Thus the ugliness below. +#ifdef _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_NAMESPACE_STD +#else +namespace std { +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_BEGIN_NAMESPACE_VERSION +#endif +#endif + +template <class __T, class __Cmp> +inline __device__ const __T & +max(const __T &__a, const __T &__b, __Cmp __cmp) { + return __cmp(__a, __b) ? __b : __a; +} + +template <class __T> +inline __device__ const __T & +max(const __T &__a, const __T &__b) { + return __a < __b ? __b : __a; +} + +template <class __T, class __Cmp> +inline __device__ const __T & +min(const __T &__a, const __T &__b, __Cmp __cmp) { + return __cmp(__b, __a) ? __b : __a; +} + +template <class __T> +inline __device__ const __T & +min(const __T &__a, const __T &__b) { + return __a < __b ? __b : __a; +} + +#ifdef _LIBCPP_END_NAMESPACE_STD +_LIBCPP_END_NAMESPACE_STD +#else +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_END_NAMESPACE_VERSION +#endif +} // namespace std +#endif + +#endif // __cplusplus <= 201103L +#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM diff --git a/lib/Headers/cuda_wrappers/complex b/lib/Headers/cuda_wrappers/complex new file mode 100644 index 000000000000..11d40a82a8f6 --- /dev/null +++ b/lib/Headers/cuda_wrappers/complex @@ -0,0 +1,82 @@ +/*===---- complex - CUDA wrapper for <complex> ------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX +#define __CLANG_CUDA_WRAPPERS_COMPLEX + +// Wrapper around <complex> that forces its functions to be __host__ +// __device__. + +// First, include host-only headers we think are likely to be included by +// <complex>, so that the pragma below only applies to <complex> itself. +#if __cplusplus >= 201103L +#include <type_traits> +#endif +#include <stdexcept> +#include <cmath> +#include <sstream> + +// Next, include our <algorithm> wrapper, to ensure that device overloads of +// std::min/max are available. +#include <algorithm> + +#pragma clang force_cuda_host_device begin + +// When compiling for device, ask libstdc++ to use its own implements of +// complex functions, rather than calling builtins (which resolve to library +// functions that don't exist when compiling CUDA device code). +// +// This is a little dicey, because it causes libstdc++ to define a different +// set of overloads on host and device. +// +// // Present only when compiling for host. +// __host__ __device__ void complex<float> sin(const complex<float>& x) { +// return __builtin_csinf(x); +// } +// +// // Present when compiling for host and for device. +// template <typename T> +// void __host__ __device__ complex<T> sin(const complex<T>& x) { +// return complex<T>(sin(x.real()) * cosh(x.imag()), +// cos(x.real()), sinh(x.imag())); +// } +// +// This is safe because when compiling for device, all function calls in +// __host__ code to sin() will still resolve to *something*, even if they don't +// resolve to the same function as they resolve to when compiling for host. We +// don't care that they don't resolve to the right function because we won't +// codegen this host code when compiling for device. + +#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX") +#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1") +#define _GLIBCXX_USE_C99_COMPLEX 0 +#define _GLIBCXX_USE_C99_COMPLEX_TR1 0 + +#include_next <complex> + +#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1") +#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX") + +#pragma clang force_cuda_host_device end + +#endif // include guard diff --git a/lib/Headers/cuda_wrappers/new b/lib/Headers/cuda_wrappers/new new file mode 100644 index 000000000000..b77131af0e5b --- /dev/null +++ b/lib/Headers/cuda_wrappers/new @@ -0,0 +1,47 @@ +/*===---- complex - CUDA wrapper for <new> ------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_WRAPPERS_NEW +#define __CLANG_CUDA_WRAPPERS_NEW + +#include_next <new> + +// Device overrides for placement new and delete. +#pragma push_macro("CUDA_NOEXCEPT") +#if __cplusplus >= 201103L +#define CUDA_NOEXCEPT noexcept +#else +#define CUDA_NOEXCEPT +#endif + +__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { + return __ptr; +} +__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { + return __ptr; +} +__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {} +__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {} +#pragma pop_macro("CUDA_NOEXCEPT") + +#endif // include guard diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h index 70d6d726110a..1512f9f0b47b 100644 --- a/lib/Headers/emmintrin.h +++ b/lib/Headers/emmintrin.h @@ -49,6 +49,21 @@ typedef signed char __v16qs __attribute__((__vector_size__(16))); /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) +/// \brief Adds lower double-precision values in both operands and returns the +/// sum in the lower 64 bits of the result. The upper 64 bits of the result +/// are copied from the upper double-precision value of the first operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// sum of the lower 64 bits of both operands. The upper 64 bits are copied +/// from the upper 64 bits of the first source operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b) { @@ -56,12 +71,41 @@ _mm_add_sd(__m128d __a, __m128d __b) return __a; } +/// \brief Adds two 128-bit vectors of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] containing the sums of both +/// operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b) { return (__m128d)((__v2df)__a + (__v2df)__b); } +/// \brief Subtracts the lower double-precision value of the second operand +/// from the lower double-precision value of the first operand and returns +/// the difference in the lower 64 bits of the result. The upper 64 bits of +/// the result are copied from the upper double-precision value of the first +/// operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the minuend. 
+/// \param __b +/// A 128-bit vector of [2 x double] containing the subtrahend. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// difference of the lower 64 bits of both operands. The upper 64 bits are +/// copied from the upper 64 bits of the first source operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b) { @@ -69,12 +113,40 @@ _mm_sub_sd(__m128d __a, __m128d __b) return __a; } +/// \brief Subtracts two 128-bit vectors of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the minuend. +/// \param __b +/// A 128-bit vector of [2 x double] containing the subtrahend. +/// \returns A 128-bit vector of [2 x double] containing the differences between +/// both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b) { return (__m128d)((__v2df)__a - (__v2df)__b); } +/// \brief Multiplies lower double-precision values in both operands and returns +/// the product in the lower 64 bits of the result. The upper 64 bits of the +/// result are copied from the upper double-precision value of the first +/// operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// product of the lower 64 bits of both operands. The upper 64 bits are +/// copied from the upper 64 bits of the first source operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b) { @@ -82,12 +154,41 @@ _mm_mul_sd(__m128d __a, __m128d __b) return __a; } +/// \brief Multiplies two 128-bit vectors of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \returns A 128-bit vector of [2 x double] containing the products of both +/// operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b) { return (__m128d)((__v2df)__a * (__v2df)__b); } +/// \brief Divides the lower double-precision value of the first operand by the +/// lower double-precision value of the second operand and returns the +/// quotient in the lower 64 bits of the result. The upper 64 bits of the +/// result are copied from the upper double-precision value of the first +/// operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the dividend. +/// \param __b +/// A 128-bit vector of [2 x double] containing divisor. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// quotient of the lower 64 bits of both operands. The upper 64 bits are +/// copied from the upper 64 bits of the first source operand. 
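The scalar-versus-packed split documented above can be seen in a small sketch (hypothetical function and array names, assumes SSE2):

/* Hypothetical demo: _sd operates on the low lane only, _pd on both lanes. */
#include <emmintrin.h>

void demo_div(double out_sd[2], double out_pd[2]) {
  __m128d a = _mm_set_pd(10.0, 2.0);  /* a = { 2.0, 10.0 } (low, high) */
  __m128d b = _mm_set_pd(100.0, 4.0); /* b = { 4.0, 100.0 } */

  /* Low lane: 2.0 / 4.0 = 0.5; high lane copied from a (10.0). */
  _mm_storeu_pd(out_sd, _mm_div_sd(a, b));

  /* Both lanes divided: { 0.5, 0.1 }. */
  _mm_storeu_pd(out_pd, _mm_div_pd(a, b));
}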
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b) { @@ -95,12 +196,44 @@ _mm_div_sd(__m128d __a, __m128d __b) return __a; } +/// \brief Performs an element-by-element division of two 128-bit vectors of +/// [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the dividend. +/// \param __b +/// A 128-bit vector of [2 x double] containing the divisor. +/// \returns A 128-bit vector of [2 x double] containing the quotients of both +/// operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b) { return (__m128d)((__v2df)__a / (__v2df)__b); } +/// \brief Calculates the square root of the lower double-precision value of +/// the second operand and returns it in the lower 64 bits of the result. +/// The upper 64 bits of the result are copied from the upper double- +/// precision value of the first operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// upper 64 bits of this operand are copied to the upper 64 bits of the +/// result. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// square root is calculated using the lower 64 bits of this operand. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// square root of the lower 64 bits of operand \a __b, and whose upper 64 +/// bits are copied from the upper 64 bits of operand \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b) { @@ -108,150 +241,518 @@ _mm_sqrt_sd(__m128d __a, __m128d __b) return (__m128d) { __c[0], __a[1] }; } +/// \brief Calculates the square root of the each of two values stored in a +/// 128-bit vector of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector of [2 x double] containing the square roots of the +/// values in the operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) { return __builtin_ia32_sqrtpd((__v2df)__a); } +/// \brief Compares lower 64-bit double-precision values of both operands, and +/// returns the lesser of the pair of values in the lower 64-bits of the +/// result. The upper 64 bits of the result are copied from the upper double- +/// precision value of the first operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// lower 64 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// lower 64 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// minimum value between both operands. The upper 64 bits are copied from +/// the upper 64 bits of the first source operand. 
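A minimal sketch of the two-operand _mm_sqrt_sd form described above (hypothetical wrapper name, assumes SSE2):

/* Hypothetical demo: the square root comes from the low lane of the second
   operand, the high lane is carried over from the first operand. */
#include <emmintrin.h>

__m128d sqrt_low_keep_high(__m128d a, __m128d b) {
  /* Result: { sqrt(b[0]), a[1] }. */
  return _mm_sqrt_sd(a, b);
}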
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b) { return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); } +/// \brief Performs element-by-element comparison of the two 128-bit vectors of +/// [2 x double] and returns the vector containing the lesser of each pair of +/// values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \returns A 128-bit vector of [2 x double] containing the minimum values +/// between both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b) { return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); } +/// \brief Compares lower 64-bits double-precision values of both operands, and +/// returns the greater of the pair of values in the lower 64-bits of the +/// result. The upper 64 bits of the result are copied from the upper double- +/// precision value of the first operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// lower 64 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// lower 64 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// maximum value between both operands. The upper 64 bits are copied from +/// the upper 64 bits of the first source operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b) { return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); } +/// \brief Performs element-by-element comparison of the two 128-bit vectors of +/// [2 x double] and returns the vector containing the greater of each pair +/// of values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \returns A 128-bit vector of [2 x double] containing the maximum values +/// between both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b) { return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); } +/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the +/// values between both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b) { - return (__m128d)((__v4su)__a & (__v4su)__b); + return (__m128d)((__v2du)__a & (__v2du)__b); } +/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using +/// the one's complement of the values contained in the first source operand. 
+/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the left source operand. The +/// one's complement of this value is used in the bitwise AND. +/// \param __b +/// A 128-bit vector of [2 x double] containing the right source operand. +/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the +/// values in the second operand and the one's complement of the first +/// operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b) { - return (__m128d)(~(__v4su)__a & (__v4su)__b); + return (__m128d)(~(__v2du)__a & (__v2du)__b); } +/// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the +/// values between both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b) { - return (__m128d)((__v4su)__a | (__v4su)__b); + return (__m128d)((__v2du)__a | (__v2du)__b); } +/// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the +/// values between both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b) { - return (__m128d)((__v4su)__a ^ (__v4su)__b); + return (__m128d)((__v2du)__a ^ (__v2du)__b); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0h +/// for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are less than those in the second operand. Each comparison +/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. 
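One common use of the bitwise operations documented above is a branch-free absolute value: clearing the sign bit of both lanes with _mm_andnot_pd. This is an illustrative sketch, not part of the diff, and the helper name is hypothetical.

#include <emmintrin.h>

static __m128d abs_pd(__m128d x)
{
  const __m128d sign_bit = _mm_set1_pd(-0.0);  /* only bit 63 set in each lane */
  return _mm_andnot_pd(sign_bit, x);           /* ~sign_bit & x clears the sign */
}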
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are less than or equal to those in the second operand. Each +/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are greater than those in the second operand. Each comparison +/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are greater than or equal to those in the second operand. Each +/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are ordered with respect to those in the second operand. A pair +/// of double-precision values are "ordered" with respect to each other if +/// neither value is a NaN. Each comparison yields 0h for false, +/// FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are unordered with respect to those in the second operand. 
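The packed comparisons above produce all-ones or all-zero lane masks, which combine naturally with the bitwise operations to form a per-lane select. A sketch under that assumption follows; it is illustrative only, with hypothetical names.

#include <emmintrin.h>

static void compare_select_example(double out[2])
{
  __m128d a = _mm_set_pd(1.0, 5.0);  /* element 0 = 5.0, element 1 = 1.0 */
  __m128d b = _mm_set_pd(2.0, 3.0);  /* element 0 = 3.0, element 1 = 2.0 */
  __m128d m = _mm_cmpgt_pd(a, b);    /* element 0: all ones (5 > 3), element 1: all zeros */
  /* Branch-free per-lane select: take a where the mask is set, b elsewhere. */
  __m128d sel = _mm_or_pd(_mm_and_pd(m, a), _mm_andnot_pd(m, b));
  _mm_storeu_pd(out, sel);           /* { 5.0, 2.0 }, i.e. the element-wise maximum */
}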
A pair +/// of double-precision values are "unordered" with respect to each other if +/// one or both values are NaN. Each comparison yields 0h for false, +/// FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are unequal to those in the second operand. Each comparison +/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are not less than those in the second operand. Each comparison +/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are not less than or equal to those in the second operand. Each +/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are not greater than those in the second operand. Each +/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. 
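Because a lane is "unordered" with itself exactly when it holds a NaN, _mm_cmpunord_pd doubles as a NaN detector. The sketch below is illustrative, not part of the diff; the helper name is hypothetical and _mm_movemask_pd (also provided by emmintrin.h) collects the lane masks into an integer.

#include <emmintrin.h>

static int nan_lanes(__m128d v)
{
  __m128d nan_mask = _mm_cmpunord_pd(v, v);  /* all ones in lanes holding NaN */
  return _mm_movemask_pd(nan_mask);          /* bit i set when element i is NaN */
}

/* e.g. passing _mm_set_pd(nan(""), 1.0) (nan() from <math.h>) returns 0x2:
   only the upper lane is NaN. */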
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); } +/// \brief Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are not greater than or equal to those in the second operand. +/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] for equality. The +/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than the corresponding value in +/// the second parameter. The comparison yields 0h for false, +/// FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than or equal to the +/// corresponding value in the second parameter. The comparison yields 0h for +/// false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. 
The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than the corresponding value +/// in the second parameter. The comparison yields 0h for false, +/// FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b) { @@ -259,6 +760,24 @@ _mm_cmpgt_sd(__m128d __a, __m128d __b) return (__m128d) { __c[0], __a[1] }; } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than or equal to the +/// corresponding value in the second parameter. The comparison yields 0h for +/// false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b) { @@ -266,36 +785,147 @@ _mm_cmpge_sd(__m128d __a, __m128d __b) return (__m128d) { __c[0], __a[1] }; } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is "ordered" with respect to the +/// corresponding value in the second parameter. The comparison yields 0h for +/// false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are +/// "ordered" with respect to each other if neither value is a NaN. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. 
+/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is "unordered" with respect to the +/// corresponding value in the second parameter. The comparison yields 0h +/// for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values +/// are "unordered" with respect to each other if one or both values are NaN. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is unequal to the corresponding value in +/// the second parameter. The comparison yields 0h for false, +/// FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is not less than the corresponding +/// value in the second parameter. The comparison yields 0h for false, +/// FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. 
The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is not less than or equal to the +/// corresponding value in the second parameter. The comparison yields 0h +/// for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is not greater than the corresponding +/// value in the second parameter. The comparison yields 0h for false, +/// FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b) { @@ -303,6 +933,24 @@ _mm_cmpngt_sd(__m128d __a, __m128d __b) return (__m128d) { __c[0], __a[1] }; } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is not greater than or equal to the +/// corresponding value in the second parameter. The comparison yields 0h +/// for false, FFFFFFFFFFFFFFFFh for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
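The scalar (_sd) comparisons above return a mask only in the low lane and pass the high lane of the first operand through unchanged. A small illustrative sketch, not part of the diff, with hypothetical names:

#include <emmintrin.h>

static void scalar_compare_example(double out[2])
{
  __m128d a = _mm_set_pd(7.0, 1.0);  /* element 0 = 1.0, element 1 = 7.0 */
  __m128d b = _mm_set_pd(9.0, 2.0);  /* element 0 = 2.0, element 1 = 9.0 */
  __m128d r = _mm_cmplt_sd(a, b);    /* element 0: all-ones mask (1.0 < 2.0), element 1: 7.0 from a */
  _mm_storeu_pd(out, r);
  /* out[1] == 7.0; out[0] holds the all-ones bit pattern, which reads back as a NaN. */
}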
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b) { @@ -310,84 +958,317 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b) return (__m128d) { __c[0], __a[1] }; } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] for equality. The +/// comparison yields 0 for false, 1 for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than the corresponding value in +/// the second parameter. The comparison yields 0 for false, 1 for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than or equal to the +/// corresponding value in the second parameter. The comparison yields 0 for +/// false, 1 for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than the corresponding value +/// in the second parameter. The comparison yields 0 for false, 1 for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. 
+/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than or equal to the +/// corresponding value in the second parameter. The comparison yields 0 for +/// false, 1 for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is unequal to the corresponding value in +/// the second parameter. The comparison yields 0 for false, 1 for true. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] for equality. The +/// comparison yields 0 for false, 1 for true. If either of the two lower +/// double-precision values is NaN, 1 is returned. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 1 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than the corresponding value in +/// the second parameter. 
The comparison yields 0 for false, 1 for true. If +/// either of the two lower double-precision values is NaN, 1 is returned. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 1 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than or equal to the +/// corresponding value in the second parameter. The comparison yields 0 for +/// false, 1 for true. If either of the two lower double-precision values is +/// NaN, 1 is returned. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 1 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than the corresponding value +/// in the second parameter. The comparison yields 0 for false, 1 for true. +/// If either of the two lower double-precision values is NaN, 0 is returned. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than or equal to the +/// corresponding value in the second parameter. The comparison yields 0 for +/// false, 1 for true. If either of the two lower double-precision values +/// is NaN, 0 is returned. 
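Unlike the mask-producing compares, the COMISD/UCOMISD intrinsics documented here return plain 0/1 integers suitable for branching. A minimal sketch under that reading, illustrative only, with a hypothetical helper name:

#include <emmintrin.h>

static int low_lane_is_less(__m128d a, __m128d b)
{
  int lt = _mm_comilt_sd(a, b);   /* 1 when the low lane of a is less than that of b */
  int ge = _mm_ucomige_sd(a, b);  /* unordered variant; 0 for the same inputs */
  return lt && !ge;               /* consistent for ordinary (non-NaN) inputs */
}

/* e.g. low_lane_is_less(_mm_set_sd(1.5), _mm_set_sd(2.0)) returns 1. */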
+/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); } +/// \brief Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is unequal to the corresponding value in +/// the second parameter. The comparison yields 0 for false, 1 for true. If +/// either of the two lower double-precision values is NaN, 0 is returned. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison result. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); } +/// \brief Converts the two double-precision floating-point elements of a +/// 128-bit vector of [2 x double] into two single-precision floating-point +/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. +/// The upper 64 bits of the result vector are set to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the +/// converted values. The upper 64 bits are set to zero. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) { return __builtin_ia32_cvtpd2ps((__v2df)__a); } +/// \brief Converts the lower two single-precision floating-point elements of a +/// 128-bit vector of [4 x float] into two double-precision floating-point +/// values, returned in a 128-bit vector of [2 x double]. The upper two +/// elements of the input vector are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower two single-precision +/// floating-point elements are converted to double-precision values. The +/// upper two elements are unused. +/// \returns A 128-bit vector of [2 x double] containing the converted values. 
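The two conversions documented above are inverses up to float precision, as the illustrative sketch below (not part of the diff, hypothetical name) shows.

#include <emmintrin.h>

static __m128d narrow_and_widen(__m128d d)
{
  __m128 s = _mm_cvtpd_ps(d);  /* two doubles -> two floats in the low half; upper half zeroed */
  return _mm_cvtps_pd(s);      /* back to [2 x double]; exact only when the values fit in float */
}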
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) { @@ -395,6 +1276,19 @@ _mm_cvtps_pd(__m128 __a) __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); } +/// \brief Converts the lower two integer elements of a 128-bit vector of +/// [4 x i32] into two double-precision floating-point values, returned in a +/// 128-bit vector of [2 x double]. The upper two elements of the input +/// vector are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction. +/// +/// \param __a +/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are +/// converted to double-precision values. The upper two elements are unused. +/// \returns A 128-bit vector of [2 x double] containing the converted values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) { @@ -402,24 +1296,84 @@ _mm_cvtepi32_pd(__m128i __a) __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); } +/// \brief Converts the two double-precision floating-point elements of a +/// 128-bit vector of [2 x double] into two signed 32-bit integer values, +/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper +/// 64 bits of the result vector are set to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the +/// converted values. The upper 64 bits are set to zero. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) { return __builtin_ia32_cvtpd2dq((__v2df)__a); } +/// \brief Converts the low-order element of a 128-bit vector of [2 x double] +/// into a 32-bit signed integer value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the +/// conversion. +/// \returns A 32-bit signed integer containing the converted value. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) { return __builtin_ia32_cvtsd2si((__v2df)__a); } +/// \brief Converts the lower double-precision floating-point element of a +/// 128-bit vector of [2 x double], in the second parameter, into a +/// single-precision floating-point value, returned in the lower 32 bits of a +/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are +/// copied from the upper 96 bits of the first parameter. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are +/// copied to the upper 96 bits of the result. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision +/// floating-point element is used in the conversion. +/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the +/// converted value from the second parameter. The upper 96 bits are copied +/// from the upper 96 bits of the first parameter. 
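_mm_cvtsd_si32 rounds according to the current MXCSR rounding mode; a tiny illustrative sketch (hypothetical name, default rounding assumed):

#include <emmintrin.h>

static int round_low_lane(void)
{
  __m128d v = _mm_set_sd(2.75);
  return _mm_cvtsd_si32(v);  /* 3 under the default round-to-nearest mode */
}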
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b) { return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); } +/// \brief Converts a 32-bit signed integer value, in the second parameter, into +/// a double-precision floating-point value, returned in the lower 64 bits of +/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector +/// are copied from the upper 64 bits of the first parameter. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are +/// copied to the upper 64 bits of the result. +/// \param __b +/// A 32-bit signed integer containing the value to be converted. +/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the +/// converted value from the second parameter. The upper 64 bits are copied +/// from the upper 64 bits of the first parameter. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b) { @@ -427,6 +1381,25 @@ _mm_cvtsi32_sd(__m128d __a, int __b) return __a; } +/// \brief Converts the lower single-precision floating-point element of a +/// 128-bit vector of [4 x float], in the second parameter, into a +/// double-precision floating-point value, returned in the lower 64 bits of +/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector +/// are copied from the upper 64 bits of the first parameter. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are +/// copied to the upper 64 bits of the result. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower single-precision +/// floating-point element is used in the conversion. +/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the +/// converted value from the second parameter. The upper 64 bits are copied +/// from the upper 64 bits of the first parameter. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b) { @@ -434,48 +1407,145 @@ _mm_cvtss_sd(__m128d __a, __m128 __b) return __a; } +/// \brief Converts the two double-precision floating-point elements of a +/// 128-bit vector of [2 x double] into two signed 32-bit integer values, +/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the +/// result of either conversion is inexact, the result is truncated (rounded +/// towards zero) regardless of the current MXCSR setting. The upper 64 bits +/// of the result vector are set to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the +/// converted values. The upper 64 bits are set to zero. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) { return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); } +/// \brief Converts the low-order element of a [2 x double] vector into a 32-bit +/// signed integer value, truncating the result when it is inexact. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. 
The lower 64 bits are used in the +/// conversion. +/// \returns A 32-bit signed integer containing the converted value. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) { return __builtin_ia32_cvttsd2si((__v2df)__a); } +/// \brief Converts the two double-precision floating-point elements of a +/// 128-bit vector of [2 x double] into two signed 32-bit integer values, +/// returned in a 64-bit vector of [2 x i32]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 64-bit vector of [2 x i32] containing the converted values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) { return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); } +/// \brief Converts the two double-precision floating-point elements of a +/// 128-bit vector of [2 x double] into two signed 32-bit integer values, +/// returned in a 64-bit vector of [2 x i32]. If the result of either +/// conversion is inexact, the result is truncated (rounded towards zero) +/// regardless of the current MXCSR setting. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 64-bit vector of [2 x i32] containing the converted values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) { return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); } +/// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of +/// [2 x i32] into two double-precision floating-point values, returned in a +/// 128-bit vector of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction. +/// +/// \param __a +/// A 64-bit vector of [2 x i32]. +/// \returns A 128-bit vector of [2 x double] containing the converted values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) { return __builtin_ia32_cvtpi2pd((__v2si)__a); } +/// \brief Returns the low-order element of a 128-bit vector of [2 x double] as +/// a double-precision floating-point value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower 64 bits are returned. +/// \returns A double-precision floating-point value copied from the lower 64 +/// bits of \a __a. static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) { return __a[0]; } +/// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned +/// memory location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. +/// +/// \param __dp +/// A pointer to a 128-bit memory location. The address of the memory +/// location has to be 16-byte aligned. +/// \returns A 128-bit vector of [2 x double] containing the loaded values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) { return *(__m128d*)__dp; } +/// \brief Loads a double-precision floating-point value from a specified memory +/// location and duplicates it to both vector elements of a 128-bit vector of +/// [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction. +/// +/// \param __dp +/// A pointer to a memory location containing a double-precision value. 
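The truncating conversion and the direct low-lane read documented above behave as in this illustrative sketch (not part of the diff, hypothetical names); it complements the rounding example earlier.

#include <emmintrin.h>

static void truncate_example(int *trunc, double *low)
{
  __m128d v = _mm_set_sd(2.75);
  *trunc = _mm_cvttsd_si32(v);  /* 2: always truncates toward zero, unlike _mm_cvtsd_si32 */
  *low = _mm_cvtsd_f64(v);      /* 2.75: simply reads back the low lane */
}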
+/// \returns A 128-bit vector of [2 x double] containing the loaded and +/// duplicated values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) { @@ -488,6 +1558,20 @@ _mm_load1_pd(double const *__dp) #define _mm_load_pd1(dp) _mm_load1_pd(dp) +/// \brief Loads two double-precision values, in reverse order, from an aligned +/// memory location into a 128-bit vector of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction + +/// needed shuffling instructions. In AVX mode, the shuffling may be combined +/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction. +/// +/// \param __dp +/// A 16-byte aligned pointer to an array of double-precision values to be +/// loaded in reverse order. +/// \returns A 128-bit vector of [2 x double] containing the reversed loaded +/// values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) { @@ -495,6 +1579,17 @@ _mm_loadr_pd(double const *__dp) return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); } +/// \brief Loads a 128-bit floating-point vector of [2 x double] from an +/// unaligned memory location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. +/// +/// \param __dp +/// A pointer to a 128-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \returns A 128-bit vector of [2 x double] containing the loaded values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) { @@ -524,6 +1619,23 @@ _mm_load_sd(double const *__dp) return (__m128d){ __u, 0 }; } +/// \brief Loads a double-precision value into the high-order bits of a 128-bit +/// vector of [2 x double]. The low-order bits are copied from the low-order +/// bits of the first operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. \n +/// Bits [63:0] are written to bits [63:0] of the result. +/// \param __dp +/// A pointer to a 64-bit memory location containing a double-precision +/// floating-point value that is loaded. The loaded value is written to bits +/// [127:64] of the result. The address of the memory location does not have +/// to be aligned. +/// \returns A 128-bit vector of [2 x double] containing the moved values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp) { @@ -534,6 +1646,23 @@ _mm_loadh_pd(__m128d __a, double const *__dp) return (__m128d){ __a[0], __u }; } +/// \brief Loads a double-precision value into the low-order bits of a 128-bit +/// vector of [2 x double]. The high-order bits are copied from the +/// high-order bits of the first operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. \n +/// Bits [127:64] are written to bits [127:64] of the result. +/// \param __dp +/// A pointer to a 64-bit memory location containing a double-precision +/// floating-point value that is loaded. The loaded value is written to bits +/// [63:0] of the result. The address of the memory location does not have to +/// be aligned. +/// \returns A 128-bit vector of [2 x double] containing the moved values. 
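A short illustrative sketch of the load intrinsics documented above (unaligned load plus the half-register loads); not part of the diff, names hypothetical.

#include <emmintrin.h>

static void load_example(double out[2])
{
  double buf[2] = { 1.0, 2.0 };
  __m128d v = _mm_loadu_pd(buf);  /* unaligned load: { 1.0, 2.0 } */
  v = _mm_loadh_pd(v, &buf[0]);   /* overwrite the high lane: { 1.0, 1.0 } */
  v = _mm_loadl_pd(v, &buf[1]);   /* overwrite the low lane:  { 2.0, 1.0 } */
  _mm_storeu_pd(out, v);          /* out[0] == 2.0, out[1] == 1.0 */
}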
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp) { @@ -544,48 +1673,149 @@ _mm_loadl_pd(__m128d __a, double const *__dp) return (__m128d){ __u, __a[1] }; } +/// \brief Constructs a 128-bit floating-point vector of [2 x double] with +/// unspecified content. This could be used as an argument to another +/// intrinsic function where the argument is required but the value is not +/// actually used. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \returns A 128-bit floating-point vector of [2 x double] with unspecified +/// content. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) { return (__m128d)__builtin_ia32_undef128(); } +/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower +/// 64 bits of the vector are initialized with the specified double-precision +/// floating-point value. The upper 64 bits are set to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize the lower 64 +/// bits of the result. +/// \returns An initialized 128-bit floating-point vector of [2 x double]. The +/// lower 64 bits contain the value of the parameter. The upper 64 bits are +/// set to zero. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) { return (__m128d){ __w, 0 }; } +/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each +/// of the two double-precision floating-point vector elements set to the +/// specified double-precision floating-point value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize each vector +/// element of the result. +/// \returns An initialized 128-bit floating-point vector of [2 x double]. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) { return (__m128d){ __w, __w }; } +/// \brief Constructs a 128-bit floating-point vector of [2 x double] +/// initialized with the specified double-precision floating-point values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize the upper 64 +/// bits of the result. +/// \param __x +/// A double-precision floating-point value used to initialize the lower 64 +/// bits of the result. +/// \returns An initialized 128-bit floating-point vector of [2 x double]. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x) { return (__m128d){ __x, __w }; } +/// \brief Constructs a 128-bit floating-point vector of [2 x double], +/// initialized in reverse order with the specified double-precision +/// floating-point values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize the lower 64 +/// bits of the result. +/// \param __x +/// A double-precision floating-point value used to initialize the upper 64 +/// bits of the result. +/// \returns An initialized 128-bit floating-point vector of [2 x double]. 
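The argument order of _mm_set_pd versus _mm_setr_pd is a frequent source of confusion; the illustrative sketch below (not part of the diff, hypothetical names) shows that the two calls build the same vector.

#include <emmintrin.h>

static void set_order_example(double out_a[2], double out_b[2])
{
  __m128d a = _mm_set_pd(2.0, 1.0);   /* arguments are (high, low): element 0 = 1.0, element 1 = 2.0 */
  __m128d b = _mm_setr_pd(1.0, 2.0);  /* arguments are (low, high): the same vector as a */
  _mm_storeu_pd(out_a, a);            /* { 1.0, 2.0 } in memory order */
  _mm_storeu_pd(out_b, b);            /* identical */
}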
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x) { return (__m128d){ __w, __x }; } +/// \brief Constructs a 128-bit floating-point vector of [2 x double] +/// initialized to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. +/// +/// \returns An initialized 128-bit floating-point vector of [2 x double] with +/// all elements set to zero. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) { return (__m128d){ 0, 0 }; } +/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower +/// 64 bits are set to the lower 64 bits of the second parameter. The upper +/// 64 bits are set to the upper 64 bits of the first parameter. +// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the +/// upper 64 bits of the result. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the +/// lower 64 bits of the result. +/// \returns A 128-bit vector of [2 x double] containing the moved values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b) { return (__m128d){ __b[0], __a[1] }; } +/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a +/// memory location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. +/// +/// \param __dp +/// A pointer to a 64-bit memory location. +/// \param __a +/// A 128-bit vector of [2 x double] containing the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a) { @@ -608,12 +1838,36 @@ _mm_store1_pd(double *__dp, __m128d __a) _mm_store_pd(__dp, __a); } +/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory +/// location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. +/// +/// \param __dp +/// A pointer to a 128-bit memory location. The address of the memory +/// location has to be 16-byte aligned. +/// \param __a +/// A 128-bit vector of [2 x double] containing the values to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a) { return _mm_store1_pd(__dp, __a); } +/// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory +/// location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. +/// +/// \param __dp +/// A pointer to a 128-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \param __a +/// A 128-bit vector of [2 x double] containing the values to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a) { @@ -623,6 +1877,20 @@ _mm_storeu_pd(double *__dp, __m128d __a) ((struct __storeu_pd*)__dp)->__v = __a; } +/// \brief Stores two double-precision values, in reverse order, from a 128-bit +/// vector of [2 x double] to a 16-byte aligned memory location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to a shuffling instruction followed by a +/// <c> VMOVAPD / MOVAPD </c> instruction. +/// +/// \param __dp +/// A pointer to a 16-byte aligned memory location that can store two +/// double-precision values. 
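An illustrative sketch of _mm_move_sd together with an unaligned store, matching the blend semantics documented above; not part of the diff, names hypothetical.

#include <emmintrin.h>

static void move_sd_example(double out[2])
{
  __m128d a = _mm_set_pd(4.0, 3.0);  /* element 0 = 3.0, element 1 = 4.0 */
  __m128d b = _mm_set_pd(6.0, 5.0);  /* element 0 = 5.0, element 1 = 6.0 */
  __m128d m = _mm_move_sd(a, b);     /* low lane from b, high lane from a */
  _mm_storeu_pd(out, m);             /* out[0] == 5.0, out[1] == 4.0 */
}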
+/// \param __a +/// A 128-bit vector of [2 x double] containing the values to be reversed and +/// stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a) { @@ -630,6 +1898,17 @@ _mm_storer_pd(double *__dp, __m128d __a) *(__m128d *)__dp = __a; } +/// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a +/// memory location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. +/// +/// \param __dp +/// A pointer to a 64-bit memory location. +/// \param __a +/// A 128-bit vector of [2 x double] containing the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a) { @@ -639,6 +1918,17 @@ _mm_storeh_pd(double *__dp, __m128d __a) ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; } +/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a +/// memory location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. +/// +/// \param __dp +/// A pointer to a 64-bit memory location. +/// \param __a +/// A 128-bit vector of [2 x double] containing the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a) { @@ -648,127 +1938,391 @@ _mm_storel_pd(double *__dp, __m128d __a) ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; } +/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8], +/// saving the lower 8 bits of each sum in the corresponding element of a +/// 128-bit result vector of [16 x i8]. The integer elements of both +/// parameters can be either signed or unsigned. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [16 x i8]. +/// \param __b +/// A 128-bit vector of [16 x i8]. +/// \returns A 128-bit vector of [16 x i8] containing the sums of both +/// parameters. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b) { return (__m128i)((__v16qu)__a + (__v16qu)__b); } +/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16], +/// saving the lower 16 bits of each sum in the corresponding element of a +/// 128-bit result vector of [8 x i16]. The integer elements of both +/// parameters can be either signed or unsigned. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. +/// \param __b +/// A 128-bit vector of [8 x i16]. +/// \returns A 128-bit vector of [8 x i16] containing the sums of both +/// parameters. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hu)__a + (__v8hu)__b); } +/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32], +/// saving the lower 32 bits of each sum in the corresponding element of a +/// 128-bit result vector of [4 x i32]. The integer elements of both +/// parameters can be either signed or unsigned. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. +/// \param __b +/// A 128-bit vector of [4 x i32]. +/// \returns A 128-bit vector of [4 x i32] containing the sums of both +/// parameters. 
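Editorial aside before the integer additions below, not part of the patch: a short sketch of the 64-bit half stores (_mm_storel_pd and _mm_storeh_pd) documented earlier in this hunk. The main() wrapper and <stdio.h> output are assumptions for illustration only.

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d v = _mm_set_pd(2.0, 1.0);  /* low = 1.0, high = 2.0 */
  double lo, hi;
  _mm_storel_pd(&lo, v);             /* writes the low element  (1.0) */
  _mm_storeh_pd(&hi, v);             /* writes the high element (2.0) */
  printf("lo=%g hi=%g\n", lo, hi);
  return 0;
}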
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4su)__a + (__v4su)__b); } +/// \brief Adds two signed or unsigned 64-bit integer values, returning the +/// lower 64 bits of the sum. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> PADDQ </c> instruction. +/// +/// \param __a +/// A 64-bit integer. +/// \param __b +/// A 64-bit integer. +/// \returns A 64-bit integer containing the sum of both parameters. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); } +/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64], +/// saving the lower 64 bits of each sum in the corresponding element of a +/// 128-bit result vector of [2 x i64]. The integer elements of both +/// parameters can be either signed or unsigned. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x i64]. +/// \param __b +/// A 128-bit vector of [2 x i64]. +/// \returns A 128-bit vector of [2 x i64] containing the sums of both +/// parameters. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b) { return (__m128i)((__v2du)__a + (__v2du)__b); } +/// \brief Adds, with saturation, the corresponding elements of two 128-bit +/// signed [16 x i8] vectors, saving each sum in the corresponding element of +/// a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are +/// saturated to 7Fh. Negative sums less than 80h are saturated to 80h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction. +/// +/// \param __a +/// A 128-bit signed [16 x i8] vector. +/// \param __b +/// A 128-bit signed [16 x i8] vector. +/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of +/// both parameters. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); } +/// \brief Adds, with saturation, the corresponding elements of two 128-bit +/// signed [8 x i16] vectors, saving each sum in the corresponding element of +/// a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh +/// are saturated to 7FFFh. Negative sums less than 8000h are saturated to +/// 8000h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction. +/// +/// \param __a +/// A 128-bit signed [8 x i16] vector. +/// \param __b +/// A 128-bit signed [8 x i16] vector. +/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of +/// both parameters. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Adds, with saturation, the corresponding elements of two 128-bit +/// unsigned [16 x i8] vectors, saving each sum in the corresponding element +/// of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh +/// are saturated to FFh. Negative sums are saturated to 00h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. +/// +/// \param __a +/// A 128-bit unsigned [16 x i8] vector. +/// \param __b +/// A 128-bit unsigned [16 x i8] vector. 
+/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums +/// of both parameters. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); } +/// \brief Adds, with saturation, the corresponding elements of two 128-bit +/// unsigned [8 x i16] vectors, saving each sum in the corresponding element +/// of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh +/// are saturated to FFFFh. Negative sums are saturated to 0000h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction. +/// +/// \param __a +/// A 128-bit unsigned [8 x i16] vector. +/// \param __b +/// A 128-bit unsigned [8 x i16] vector. +/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums +/// of both parameters. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Computes the rounded averages of corresponding elements of two +/// 128-bit unsigned [16 x i8] vectors, saving each result in the +/// corresponding element of a 128-bit result vector of [16 x i8]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction. +/// +/// \param __a +/// A 128-bit unsigned [16 x i8] vector. +/// \param __b +/// A 128-bit unsigned [16 x i8] vector. +/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded +/// averages of both parameters. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); } +/// \brief Computes the rounded averages of corresponding elements of two +/// 128-bit unsigned [8 x i16] vectors, saving each result in the +/// corresponding element of a 128-bit result vector of [8 x i16]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction. +/// +/// \param __a +/// A 128-bit unsigned [8 x i16] vector. +/// \param __b +/// A 128-bit unsigned [8 x i16] vector. +/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded +/// averages of both parameters. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16] +/// vectors, producing eight intermediate 32-bit signed integer products, and +/// adds the consecutive pairs of 32-bit products to form a 128-bit signed +/// [4 x i32] vector. For example, bits [15:0] of both parameters are +/// multiplied producing a 32-bit product, bits [31:16] of both parameters +/// are multiplied producing a 32-bit product, and the sum of those two +/// products becomes bits [31:0] of the result. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction. +/// +/// \param __a +/// A 128-bit signed [8 x i16] vector. +/// \param __b +/// A 128-bit signed [8 x i16] vector. +/// \returns A 128-bit signed [4 x i32] vector containing the sums of products +/// of both parameters.
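Editorial aside, not part of the patch: the pairwise multiply-add documented above (for _mm_madd_epi16, defined just below) is easiest to see with concrete values. A minimal sketch; the main() wrapper and <stdio.h> output are assumptions for illustration only.

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  __m128i b = _mm_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1);
  int out[4];
  /* Each 32-bit lane is a[2i]*b[2i] + a[2i+1]*b[2i+1]: {3, 7, 11, 15}. */
  _mm_storeu_si128((__m128i *)out, _mm_madd_epi16(a, b));
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}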
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); } +/// \brief Compares corresponding elements of two 128-bit signed [8 x i16] +/// vectors, saving the greater value from each comparison in the +/// corresponding element of a 128-bit result vector of [8 x i16]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction. +/// +/// \param __a +/// A 128-bit signed [8 x i16] vector. +/// \param __b +/// A 128-bit signed [8 x i16] vector. +/// \returns A 128-bit signed [8 x i16] vector containing the greater value of +/// each comparison. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8] +/// vectors, saving the greater value from each comparison in the +/// corresponding element of a 128-bit result vector of [16 x i8]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction. +/// +/// \param __a +/// A 128-bit unsigned [16 x i8] vector. +/// \param __b +/// A 128-bit unsigned [16 x i8] vector. +/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of +/// each comparison. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); } +/// \brief Compares corresponding elements of two 128-bit signed [8 x i16] +/// vectors, saving the smaller value from each comparison in the +/// corresponding element of a 128-bit result vector of [8 x i16]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction. +/// +/// \param __a +/// A 128-bit signed [8 x i16] vector. +/// \param __b +/// A 128-bit signed [8 x i16] vector. +/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of +/// each comparison. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8] +/// vectors, saving the smaller value from each comparison in the +/// corresponding element of a 128-bit result vector of [16 x i8]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction. +/// +/// \param __a +/// A 128-bit unsigned [16 x i8] vector. +/// \param __b +/// A 128-bit unsigned [16 x i8] vector. +/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of +/// each comparison. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); } +/// \brief Multiplies the corresponding elements of two signed [8 x i16] +/// vectors, saving the upper 16 bits of each 32-bit product in the +/// corresponding element of a 128-bit signed [8 x i16] result vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction. +/// +/// \param __a +/// A 128-bit signed [8 x i16] vector. +/// \param __b +/// A 128-bit signed [8 x i16] vector. 
+/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of +/// each of the eight 32-bit products. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Multiplies the corresponding elements of two unsigned [8 x i16] +/// vectors, saving the upper 16 bits of each 32-bit product in the +/// corresponding element of a 128-bit unsigned [8 x i16] result vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction. +/// +/// \param __a +/// A 128-bit unsigned [8 x i16] vector. +/// \param __b +/// A 128-bit unsigned [8 x i16] vector. +/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits +/// of each of the eight 32-bit products. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Multiplies the corresponding elements of two [8 x short] vectors and -/// returns a vector containing the low-order 16 bits of each 32-bit product -/// in the corresponding element. +/// \brief Multiplies the corresponding elements of two signed [8 x i16] +/// vectors, saving the lower 16 bits of each 32-bit product in the +/// corresponding element of a 128-bit signed [8 x i16] result vector. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction. +/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction. /// /// \param __a -/// A 128-bit integer vector containing one of the source operands. +/// A 128-bit signed [8 x i16] vector. /// \param __b -/// A 128-bit integer vector containing one of the source operands. -/// \returns A 128-bit integer vector containing the products of both operands. +/// A 128-bit signed [8 x i16] vector. +/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of +/// each of the eight 32-bit products. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b) { @@ -781,7 +2335,7 @@ _mm_mullo_epi16(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PMULUDQ instruction. +/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction. /// /// \param __a /// A 64-bit integer containing one of the source operands. @@ -800,7 +2354,7 @@ _mm_mul_su32(__m64 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction. +/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction. /// /// \param __a /// A [2 x i64] vector containing one of the source operands. @@ -821,7 +2375,7 @@ _mm_mul_epu32(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction. +/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction. /// /// \param __a /// A 128-bit integer vector containing one of the source operands. @@ -839,7 +2393,7 @@ _mm_sad_epu8(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction. +/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the minuends. 
@@ -857,7 +2411,7 @@ _mm_sub_epi8(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction. +/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the minuends. @@ -875,7 +2429,7 @@ _mm_sub_epi16(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction. +/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the minuends. @@ -894,7 +2448,7 @@ _mm_sub_epi32(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSUBQ instruction. +/// This intrinsic corresponds to the <c> PSUBQ </c> instruction. /// /// \param __a /// A 64-bit integer vector containing the minuend. @@ -912,7 +2466,7 @@ _mm_sub_si64(__m64 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction. +/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the minuends. @@ -933,7 +2487,7 @@ _mm_sub_epi64(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction. +/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the minuends. @@ -954,7 +2508,7 @@ _mm_subs_epi8(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction. +/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the minuends. @@ -974,7 +2528,7 @@ _mm_subs_epi16(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction. +/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the minuends. @@ -994,7 +2548,7 @@ _mm_subs_epu8(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction. +/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the minuends. @@ -1012,7 +2566,7 @@ _mm_subs_epu16(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPAND / PAND instruction. +/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. /// /// \param __a /// A 128-bit integer vector containing one of the source operands. @@ -1031,7 +2585,7 @@ _mm_and_si128(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPANDN / PANDN instruction. +/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. /// /// \param __a /// A 128-bit vector containing the left source operand. The one's complement @@ -1049,7 +2603,7 @@ _mm_andnot_si128(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPOR / POR instruction. +/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. /// /// \param __a /// A 128-bit integer vector containing one of the source operands. 
@@ -1067,7 +2621,7 @@ _mm_or_si128(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPXOR / PXOR instruction. +/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. /// /// \param __a /// A 128-bit integer vector containing one of the source operands. @@ -1090,13 +2644,13 @@ _mm_xor_si128(__m128i __a, __m128i __b) /// __m128i _mm_slli_si128(__m128i a, const int imm); /// \endcode /// -/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction. +/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction. /// /// \param a /// A 128-bit integer vector containing the source operand. /// \param imm -/// An immediate value specifying the number of bytes to left-shift -/// operand a. +/// An immediate value specifying the number of bytes to left-shift operand +/// \a a. /// \returns A 128-bit integer vector containing the left-shifted value. #define _mm_slli_si128(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector( \ @@ -1127,13 +2681,13 @@ _mm_xor_si128(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. +/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// An integer value specifying the number of bits to left-shift each value -/// in operand __a. +/// in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count) @@ -1146,13 +2700,13 @@ _mm_slli_epi16(__m128i __a, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. +/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to left-shift each value in operand __a. +/// to left-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count) @@ -1165,13 +2719,13 @@ _mm_sll_epi16(__m128i __a, __m128i __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. +/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// An integer value specifying the number of bits to left-shift each value -/// in operand __a. +/// in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count) @@ -1184,13 +2738,13 @@ _mm_slli_epi32(__m128i __a, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. +/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to left-shift each value in operand __a. +/// to left-shift each value in operand \a __a. 
/// \returns A 128-bit integer vector containing the left-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count) @@ -1203,13 +2757,13 @@ _mm_sll_epi32(__m128i __a, __m128i __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. +/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// An integer value specifying the number of bits to left-shift each value -/// in operand __a. +/// in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count) @@ -1222,13 +2776,13 @@ _mm_slli_epi64(__m128i __a, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. +/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to left-shift each value in operand __a. +/// to left-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count) @@ -1242,13 +2796,13 @@ _mm_sll_epi64(__m128i __a, __m128i __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. +/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// An integer value specifying the number of bits to right-shift each value -/// in operand __a. +/// in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count) @@ -1262,13 +2816,13 @@ _mm_srai_epi16(__m128i __a, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. +/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to right-shift each value in operand __a. +/// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count) @@ -1282,13 +2836,13 @@ _mm_sra_epi16(__m128i __a, __m128i __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. +/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// An integer value specifying the number of bits to right-shift each value -/// in operand __a. +/// in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. 
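Editorial aside, not part of the patch: a brief sketch contrasting the arithmetic right shift documented above (for _mm_srai_epi32, defined just below) with the logical right shift documented a little further on. The main() wrapper and <stdio.h> output are assumptions for illustration only.

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_set1_epi32(-16);
  int a[4];
  unsigned u[4];
  _mm_storeu_si128((__m128i *)a, _mm_srai_epi32(v, 2)); /* shifts in sign bits: -4 */
  _mm_storeu_si128((__m128i *)u, _mm_srli_epi32(v, 2)); /* shifts in zeros: 0x3FFFFFFC */
  printf("srai: %d  srli: 0x%X\n", a[0], u[0]);
  return 0;
}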
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count) @@ -1302,13 +2856,13 @@ _mm_srai_epi32(__m128i __a, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. +/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to right-shift each value in operand __a. +/// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count) @@ -1325,13 +2879,13 @@ _mm_sra_epi32(__m128i __a, __m128i __count) /// __m128i _mm_srli_si128(__m128i a, const int imm); /// \endcode /// -/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction. +/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction. /// /// \param a /// A 128-bit integer vector containing the source operand. /// \param imm /// An immediate value specifying the number of bytes to right-shift operand -/// a. +/// \a a. /// \returns A 128-bit integer vector containing the right-shifted value. #define _mm_srli_si128(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector( \ @@ -1362,13 +2916,13 @@ _mm_sra_epi32(__m128i __a, __m128i __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. +/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// An integer value specifying the number of bits to right-shift each value -/// in operand __a. +/// in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count) @@ -1381,13 +2935,13 @@ _mm_srli_epi16(__m128i __a, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. +/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to right-shift each value in operand __a. +/// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count) @@ -1400,13 +2954,13 @@ _mm_srl_epi16(__m128i __a, __m128i __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. +/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// An integer value specifying the number of bits to right-shift each value -/// in operand __a. +/// in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count) @@ -1419,13 +2973,13 @@ _mm_srli_epi32(__m128i __a, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. 
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to right-shift each value in operand __a. +/// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count) @@ -1438,13 +2992,13 @@ _mm_srl_epi32(__m128i __a, __m128i __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. +/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// An integer value specifying the number of bits to right-shift each value -/// in operand __a. +/// in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count) @@ -1457,13 +3011,13 @@ _mm_srli_epi64(__m128i __a, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. +/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. /// /// \param __a /// A 128-bit integer vector containing the source operand. /// \param __count /// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to right-shift each value in operand __a. +/// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count) @@ -1477,7 +3031,7 @@ _mm_srl_epi64(__m128i __a, __m128i __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction. +/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction. /// /// \param __a /// A 128-bit integer vector. @@ -1496,7 +3050,7 @@ _mm_cmpeq_epi8(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction. +/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction. /// /// \param __a /// A 128-bit integer vector. @@ -1515,7 +3069,7 @@ _mm_cmpeq_epi16(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction. +/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction. /// /// \param __a /// A 128-bit integer vector. @@ -1535,7 +3089,7 @@ _mm_cmpeq_epi32(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. +/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. /// /// \param __a /// A 128-bit integer vector. @@ -1557,7 +3111,7 @@ _mm_cmpgt_epi8(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. +/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. /// /// \param __a /// A 128-bit integer vector. @@ -1577,7 +3131,7 @@ _mm_cmpgt_epi16(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. 
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. /// /// \param __a /// A 128-bit integer vector. @@ -1597,7 +3151,7 @@ _mm_cmpgt_epi32(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. +/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. /// /// \param __a /// A 128-bit integer vector. @@ -1617,7 +3171,7 @@ _mm_cmplt_epi8(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. +/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. /// /// \param __a /// A 128-bit integer vector. @@ -1637,7 +3191,7 @@ _mm_cmplt_epi16(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. +/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. /// /// \param __a /// A 128-bit integer vector. @@ -1658,7 +3212,7 @@ _mm_cmplt_epi32(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction. +/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. /// /// \param __a /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are @@ -1680,7 +3234,7 @@ _mm_cvtsi64_sd(__m128d __a, long long __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction. +/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. /// /// \param __a /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the @@ -1697,7 +3251,8 @@ _mm_cvtsd_si64(__m128d __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction. +/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> +/// instruction. /// /// \param __a /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the @@ -1714,7 +3269,7 @@ _mm_cvttsd_si64(__m128d __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction. +/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction. /// /// \param __a /// A 128-bit integer vector. @@ -1729,7 +3284,7 @@ _mm_cvtepi32_ps(__m128i __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction. +/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -1746,7 +3301,8 @@ _mm_cvtps_epi32(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction. +/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c> +/// instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -1762,7 +3318,7 @@ _mm_cvttps_epi32(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. +/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. /// /// \param __a /// A 32-bit signed integer operand. @@ -1779,7 +3335,7 @@ _mm_cvtsi32_si128(int __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 
/// /// \param __a /// A 64-bit signed integer operand containing the value to be converted. @@ -1796,7 +3352,7 @@ _mm_cvtsi64_si128(long long __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. +/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. /// /// \param __a /// A vector of [4 x i32]. The least significant 32 bits are moved to the @@ -1815,7 +3371,7 @@ _mm_cvtsi128_si32(__m128i __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. /// /// \param __a /// A vector of [2 x i64]. The least significant 64 bits are moved to the @@ -1833,7 +3389,7 @@ _mm_cvtsi128_si64(__m128i __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction. +/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction. /// /// \param __p /// An aligned pointer to a memory location containing integer values. @@ -1849,7 +3405,7 @@ _mm_load_si128(__m128i const *__p) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction. +/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction. /// /// \param __p /// A pointer to a memory location containing integer values. @@ -1868,7 +3424,7 @@ _mm_loadu_si128(__m128i const *__p) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. /// /// \param __p /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of @@ -2154,42 +3710,170 @@ _mm_set1_epi8(char __b) return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; } +/// \brief Constructs a 128-bit integer vector, initialized in reverse order +/// with the specified 64-bit integral values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> +/// instruction. +/// +/// \param __q0 +/// A 64-bit integral value used to initialize the lower 64 bits of the +/// result. +/// \param __q1 +/// A 64-bit integral value used to initialize the upper 64 bits of the +/// result. +/// \returns An initialized 128-bit integer vector. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1) { return (__m128i){ (long long)__q0, (long long)__q1 }; } +/// \brief Constructs a 128-bit integer vector, initialized in reverse order +/// with the specified 32-bit integral values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __i0 +/// A 32-bit integral value used to initialize bits [31:0] of the result. +/// \param __i1 +/// A 32-bit integral value used to initialize bits [63:32] of the result. +/// \param __i2 +/// A 32-bit integral value used to initialize bits [95:64] of the result. +/// \param __i3 +/// A 32-bit integral value used to initialize bits [127:96] of the result. +/// \returns An initialized 128-bit integer vector. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) { return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; } +/// \brief Constructs a 128-bit integer vector, initialized in reverse order +/// with the specified 16-bit integral values. 
+/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __w0 +/// A 16-bit integral value used to initialize bits [15:0] of the result. +/// \param __w1 +/// A 16-bit integral value used to initialize bits [31:16] of the result. +/// \param __w2 +/// A 16-bit integral value used to initialize bits [47:32] of the result. +/// \param __w3 +/// A 16-bit integral value used to initialize bits [63:48] of the result. +/// \param __w4 +/// A 16-bit integral value used to initialize bits [79:64] of the result. +/// \param __w5 +/// A 16-bit integral value used to initialize bits [95:80] of the result. +/// \param __w6 +/// A 16-bit integral value used to initialize bits [111:96] of the result. +/// \param __w7 +/// A 16-bit integral value used to initialize bits [127:112] of the result. +/// \returns An initialized 128-bit integer vector. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) { return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; } +/// \brief Constructs a 128-bit integer vector, initialized in reverse order +/// with the specified 8-bit integral values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __b0 +/// An 8-bit integral value used to initialize bits [7:0] of the result. +/// \param __b1 +/// An 8-bit integral value used to initialize bits [15:8] of the result. +/// \param __b2 +/// An 8-bit integral value used to initialize bits [23:16] of the result. +/// \param __b3 +/// An 8-bit integral value used to initialize bits [31:24] of the result. +/// \param __b4 +/// An 8-bit integral value used to initialize bits [39:32] of the result. +/// \param __b5 +/// An 8-bit integral value used to initialize bits [47:40] of the result. +/// \param __b6 +/// An 8-bit integral value used to initialize bits [55:48] of the result. +/// \param __b7 +/// An 8-bit integral value used to initialize bits [63:56] of the result. +/// \param __b8 +/// An 8-bit integral value used to initialize bits [71:64] of the result. +/// \param __b9 +/// An 8-bit integral value used to initialize bits [79:72] of the result. +/// \param __b10 +/// An 8-bit integral value used to initialize bits [87:80] of the result. +/// \param __b11 +/// An 8-bit integral value used to initialize bits [95:88] of the result. +/// \param __b12 +/// An 8-bit integral value used to initialize bits [103:96] of the result. +/// \param __b13 +/// An 8-bit integral value used to initialize bits [111:104] of the result. +/// \param __b14 +/// An 8-bit integral value used to initialize bits [119:112] of the result. +/// \param __b15 +/// An 8-bit integral value used to initialize bits [127:120] of the result. +/// \returns An initialized 128-bit integer vector. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) { return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; } +/// \brief Creates a 128-bit integer vector initialized to zero. 
+/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. +/// +/// \returns An initialized 128-bit integer vector with all elements set to +/// zero. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) { return (__m128i){ 0LL, 0LL }; } +/// \brief Stores a 128-bit integer vector to a memory location aligned on a +/// 128-bit boundary. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. +/// +/// \param __p +/// A pointer to an aligned memory location that will receive the integer +/// values. +/// \param __b +/// A 128-bit integer vector containing the values to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b) { *__p = __b; } +/// \brief Stores a 128-bit integer vector to an unaligned memory location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the integer values. +/// \param __b +/// A 128-bit integer vector containing the values to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i *__p, __m128i __b) { @@ -2199,12 +3883,45 @@ _mm_storeu_si128(__m128i *__p, __m128i __b) ((struct __storeu_si128*)__p)->__v = __b; } +/// \brief Moves bytes selected by the mask from the first operand to the +/// specified unaligned memory location. When a mask bit is 1, the +/// corresponding byte is written, otherwise it is not written. To minimize +/// caching, the data is flagged as non-temporal (unlikely to be used again +/// soon). Exception and trap behavior for elements not selected for storage +/// to memory are implementation dependent. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c> +/// instruction. +/// +/// \param __d +/// A 128-bit integer vector containing the values to be moved. +/// \param __n +/// A 128-bit integer vector containing the mask. The most significant bit of +/// each byte represents the mask bits. +/// \param __p +/// A pointer to an unaligned 128-bit memory location where the specified +/// values are moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) { __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); } +/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to +/// a memory location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. +/// +/// \param __p +/// A pointer to a 64-bit memory location that will receive the lower 64 bits +/// of the integer vector parameter. +/// \param __a +/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the +/// value to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i *__p, __m128i __a) { @@ -2214,18 +3931,54 @@ _mm_storel_epi64(__m128i *__p, __m128i __a) ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; } +/// \brief Stores a 128-bit floating-point vector of [2 x double] to a 128-bit +/// aligned memory location. To minimize caching, the data is flagged as +/// non-temporal (unlikely to be used again soon). +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
+/// +/// \param __p +/// A pointer to the 128-bit aligned memory location used to store the value. +/// \param __a +/// A vector of [2 x double] containing the 64-bit values to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a) { __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); } +/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location. +/// To minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. +/// +/// \param __p +/// A pointer to the 128-bit aligned memory location used to store the value. +/// \param __a +/// A 128-bit integer vector containing the values to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a) { __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); } +/// \brief Stores a 32-bit integer value in the specified memory location. To +/// minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> MOVNTI </c> instruction. +/// +/// \param __p +/// A pointer to the 32-bit memory location used to store the value. +/// \param __a +/// A 32-bit integer containing the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si32(int *__p, int __a) { @@ -2233,6 +3986,18 @@ _mm_stream_si32(int *__p, int __a) } #ifdef __x86_64__ +/// \brief Stores a 64-bit integer value in the specified memory location. To +/// minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction. +/// +/// \param __p +/// A pointer to the 64-bit memory location used to store the value. +/// \param __a +/// A 64-bit integer containing the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si64(long long *__p, long long __a) { @@ -2240,42 +4005,154 @@ _mm_stream_si64(long long *__p, long long __a) } #endif -static __inline__ void __DEFAULT_FN_ATTRS -_mm_clflush(void const *__p) -{ - __builtin_ia32_clflush(__p); -} +#if defined(__cplusplus) +extern "C" { +#endif -static __inline__ void __DEFAULT_FN_ATTRS -_mm_lfence(void) -{ - __builtin_ia32_lfence(); -} +/// \brief The cache line containing \a __p is flushed and invalidated from all +/// caches in the coherency domain. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction. +/// +/// \param __p +/// A pointer to the memory location used to identify the cache line to be +/// flushed. +void _mm_clflush(void const *); -static __inline__ void __DEFAULT_FN_ATTRS -_mm_mfence(void) -{ - __builtin_ia32_mfence(); -} +/// \brief Forces strong memory ordering (serialization) between load +/// instructions preceding this instruction and load instructions following +/// this instruction, ensuring the system completes all previous loads before +/// executing subsequent loads. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> LFENCE </c> instruction. 
+/// +void _mm_lfence(void); +/// \brief Forces strong memory ordering (serialization) between load and store +/// instructions preceding this instruction and load and store instructions +/// following this instruction, ensuring that the system completes all +/// previous memory accesses before executing subsequent memory accesses. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> MFENCE </c> instruction. +/// +void _mm_mfence(void); + +#if defined(__cplusplus) +} // extern "C" +#endif + +/// \brief Converts 16-bit signed integers from both 128-bit integer vector +/// operands into 8-bit signed integers, and packs the results into the +/// destination. Positive values greater than 0x7F are saturated to 0x7F. +/// Negative values less than 0x80 are saturated to 0x80. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction. +/// +/// \param __a +/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as +/// a signed integer and is converted to an 8-bit signed integer with +/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less +/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are +/// written to the lower 64 bits of the result. +/// \param __b +/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as +/// a signed integer and is converted to an 8-bit signed integer with +/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less +/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are +/// written to the higher 64 bits of the result. +/// \returns A 128-bit vector of [16 x i8] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); } +/// \brief Converts 32-bit signed integers from both 128-bit integer vector +/// operands into 16-bit signed integers, and packs the results into the +/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. +/// Negative values less than 0x8000 are saturated to 0x8000. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction. +/// +/// \param __a +/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as +/// a signed integer and is converted to a 16-bit signed integer with +/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values +/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values +/// are written to the lower 64 bits of the result. +/// \param __b +/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as +/// a signed integer and is converted to a 16-bit signed integer with +/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values +/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values +/// are written to the higher 64 bits of the result. +/// \returns A 128-bit vector of [8 x i16] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); } +/// \brief Converts 16-bit signed integers from both 128-bit integer vector +/// operands into 8-bit unsigned integers, and packs the results into the +/// destination. Values greater than 0xFF are saturated to 0xFF.
Values less +/// than 0x00 are saturated to 0x00. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction. +/// +/// \param __a +/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as +/// a signed integer and is converted to an 8-bit unsigned integer with +/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less +/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are +/// written to the lower 64 bits of the result. +/// \param __b +/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as +/// a signed integer and is converted to an 8-bit unsigned integer with +/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less +/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are +/// written to the higher 64 bits of the result. +/// \returns A 128-bit vector of [16 x i8] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); } +/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using +/// the immediate-value parameter as a selector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __imm +/// An immediate value. Bits [2:0] select values from \a __a to be assigned +/// to bits [15:0] of the result. \n +/// 000: assign values from bits [15:0] of \a __a. \n +/// 001: assign values from bits [31:16] of \a __a. \n +/// 010: assign values from bits [47:32] of \a __a. \n +/// 011: assign values from bits [63:48] of \a __a. \n +/// 100: assign values from bits [79:64] of \a __a. \n +/// 101: assign values from bits [95:80] of \a __a. \n +/// 110: assign values from bits [111:96] of \a __a. \n +/// 111: assign values from bits [127:112] of \a __a. +/// \returns An integer, whose lower 16 bits are selected from the 128-bit +/// integer vector parameter and the remaining bits are assigned zeros. static __inline__ int __DEFAULT_FN_ATTRS _mm_extract_epi16(__m128i __a, int __imm) { @@ -2283,6 +4160,26 @@ _mm_extract_epi16(__m128i __a, int __imm) return (unsigned short)__b[__imm & 7]; } +/// \brief Constructs a 128-bit integer vector by first making a copy of the +/// 128-bit integer vector parameter, and then inserting the lower 16 bits +/// of an integer parameter into an offset specified by the immediate-value +/// parameter. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction. +/// +/// \param __a +/// A 128-bit integer vector of [8 x i16]. This vector is copied to the +/// result and then one of the eight elements in the result is replaced by +/// the lower 16 bits of \a __b. +/// \param __b +/// An integer. The lower 16 bits of this parameter are written to the +/// result beginning at an offset specified by \a __imm. +/// \param __imm +/// An immediate value specifying the bit offset in the result at which the +/// lower 16 bits of \a __b are written. +/// \returns A 128-bit integer vector containing the constructed values.
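Editorial aside, not part of the patch: a small sketch of the word extract/insert pair documented above (_mm_extract_epi16, and _mm_insert_epi16 defined just below), using literal selectors. The main() wrapper and <stdio.h> output are assumptions for illustration only.

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
  int w = _mm_extract_epi16(v, 3);           /* element 3: 13, zero-extended */
  __m128i v2 = _mm_insert_epi16(v, 99, 3);   /* replace element 3 with 99 */
  printf("before=%d after=%d\n", w, _mm_extract_epi16(v2, 3));
  return 0;
}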
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_insert_epi16(__m128i __a, int __b, int __imm) { @@ -2291,18 +4188,85 @@ _mm_insert_epi16(__m128i __a, int __b, int __imm) return (__m128i)__c; } +/// \brief Copies the values of the most significant bits from each 8-bit +/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask +/// value, zero-extends the value, and writes it to the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the values with bits to be extracted. +/// \returns The most significant bits from each 8-bit element in \a __a, +/// written to bits [15:0]. The other bits are assigned zeros. static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) { return __builtin_ia32_pmovmskb128((__v16qi)__a); } +/// \brief Constructs a 128-bit integer vector by shuffling four 32-bit +/// elements of a 128-bit integer vector parameter, using the immediate-value +/// parameter as a specifier. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_shuffle_epi32(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. +/// +/// \param a +/// A 128-bit integer vector containing the values to be copied. +/// \param imm +/// An immediate value containing an 8-bit value specifying which elements to +/// copy from a. The destinations within the 128-bit destination are assigned +/// values as follows: \n +/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n +/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n +/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n +/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n +/// Bit value assignments: \n +/// 00: assign values from bits [31:0] of \a a. \n +/// 01: assign values from bits [63:32] of \a a. \n +/// 10: assign values from bits [95:64] of \a a. \n +/// 11: assign values from bits [127:96] of \a a. +/// \returns A 128-bit integer vector containing the shuffled values. #define _mm_shuffle_epi32(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \ (__v4si)_mm_undefined_si128(), \ ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \ ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); }) +/// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit +/// elements of a 128-bit integer vector of [8 x i16], using the immediate +/// value parameter as a specifier. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction. +/// +/// \param a +/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits +/// [127:64] of the result. +/// \param imm +/// An 8-bit immediate value specifying which elements to copy from \a a. \n +/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n +/// Bits[3:2] are used to assign values to bits [31:16] of the result. \n +/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n +/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n +/// Bit value assignments: \n +/// 00: assign values from bits [15:0] of \a a. \n +/// 01: assign values from bits [31:16] of \a a. \n +/// 10: assign values from bits [47:32] of \a a. 
\n +/// 11: assign values from bits [63:48] of \a a. \n +/// \returns A 128-bit integer vector containing the shuffled values. #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ (__v8hi)_mm_undefined_si128(), \ @@ -2310,6 +4274,33 @@ _mm_movemask_epi8(__m128i __a) ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \ 4, 5, 6, 7); }) +/// \brief Constructs a 128-bit integer vector by shuffling four upper 16-bit +/// elements of a 128-bit integer vector of [8 x i16], using the immediate +/// value parameter as a specifier. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction. +/// +/// \param a +/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits +/// [63:0] of the result. +/// \param imm +/// An 8-bit immediate value specifying which elements to copy from \a a. \n +/// Bits[1:0] are used to assign values to bits [79:64] of the result. \n +/// Bits[3:2] are used to assign values to bits [95:80] of the result. \n +/// Bits[5:4] are used to assign values to bits [111:96] of the result. \n +/// Bits[7:6] are used to assign values to bits [127:112] of the result. \n +/// Bit value assignments: \n +/// 00: assign values from bits [79:64] of \a a. \n +/// 01: assign values from bits [95:80] of \a a. \n +/// 10: assign values from bits [111:96] of \a a. \n +/// 11: assign values from bits [127:112] of \a a. \n +/// \returns A 128-bit integer vector containing the shuffled values. #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ (__v8hi)_mm_undefined_si128(), \ @@ -2319,137 +4310,480 @@ _mm_movemask_epi8(__m128i __a) 4 + (((imm) >> 4) & 0x3), \ 4 + (((imm) >> 6) & 0x3)); }) +/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors +/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [16 x i8]. +/// Bits [71:64] are written to bits [7:0] of the result. \n +/// Bits [79:72] are written to bits [23:16] of the result. \n +/// Bits [87:80] are written to bits [39:32] of the result. \n +/// Bits [95:88] are written to bits [55:48] of the result. \n +/// Bits [103:96] are written to bits [71:64] of the result. \n +/// Bits [111:104] are written to bits [87:80] of the result. \n +/// Bits [119:112] are written to bits [103:96] of the result. \n +/// Bits [127:120] are written to bits [119:112] of the result. +/// \param __b +/// A 128-bit vector of [16 x i8]. \n +/// Bits [71:64] are written to bits [15:8] of the result. \n +/// Bits [79:72] are written to bits [31:24] of the result. \n +/// Bits [87:80] are written to bits [47:40] of the result. \n +/// Bits [95:88] are written to bits [63:56] of the result. \n +/// Bits [103:96] are written to bits [79:72] of the result. \n +/// Bits [111:104] are written to bits [95:88] of the result. \n +/// Bits [119:112] are written to bits [111:104] of the result. \n +/// Bits [127:120] are written to bits [127:120] of the result. +/// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 
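(Aside, not part of the diff.) The shuffle macros above consume their control byte as four 2-bit source indices, as the bit tables spell out. A short sketch, assuming an SSE2 target; the vector contents are made up:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi32(10, 20, 30, 40);

  /* imm = 0x1B is binary 00 01 10 11: result dword 0 takes source dword 3,
     dword 1 takes 2, dword 2 takes 1, dword 3 takes 0 -> the vector is reversed. */
  __m128i r = _mm_shuffle_epi32(v, 0x1B);

  int out[4];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 40 30 20 10 */
  return 0;
}

The _MM_SHUFFLE(0, 1, 2, 3) helper from xmmintrin.h expands to the same 0x1B constant, and _mm_shufflelo_epi16 / _mm_shufflehi_epi16 read their immediates the same way, just restricted to one 64-bit half.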
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); } +/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of +/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. +/// Bits [79:64] are written to bits [15:0] of the result. \n +/// Bits [95:80] are written to bits [47:32] of the result. \n +/// Bits [111:96] are written to bits [79:64] of the result. \n +/// Bits [127:112] are written to bits [111:96] of the result. +/// \param __b +/// A 128-bit vector of [8 x i16]. +/// Bits [79:64] are written to bits [31:16] of the result. \n +/// Bits [95:80] are written to bits [63:48] of the result. \n +/// Bits [111:96] are written to bits [95:80] of the result. \n +/// Bits [127:112] are written to bits [127:112] of the result. +/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); } +/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of +/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. \n +/// Bits [95:64] are written to bits [31:0] of the destination. \n +/// Bits [127:96] are written to bits [95:64] of the destination. +/// \param __b +/// A 128-bit vector of [4 x i32]. \n +/// Bits [95:64] are written to bits [64:32] of the destination. \n +/// Bits [127:96] are written to bits [127:96] of the destination. +/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); } +/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors +/// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x i64]. \n +/// Bits [127:64] are written to bits [63:0] of the destination. +/// \param __b +/// A 128-bit vector of [2 x i64]. \n +/// Bits [127:64] are written to bits [127:64] of the destination. +/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); } +/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of +/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [16 x i8]. \n +/// Bits [7:0] are written to bits [7:0] of the result. 
\n +/// Bits [15:8] are written to bits [23:16] of the result. \n +/// Bits [23:16] are written to bits [39:32] of the result. \n +/// Bits [31:24] are written to bits [55:48] of the result. \n +/// Bits [39:32] are written to bits [71:64] of the result. \n +/// Bits [47:40] are written to bits [87:80] of the result. \n +/// Bits [55:48] are written to bits [103:96] of the result. \n +/// Bits [63:56] are written to bits [119:112] of the result. +/// \param __b +/// A 128-bit vector of [16 x i8]. +/// Bits [7:0] are written to bits [15:8] of the result. \n +/// Bits [15:8] are written to bits [31:24] of the result. \n +/// Bits [23:16] are written to bits [47:40] of the result. \n +/// Bits [31:24] are written to bits [63:56] of the result. \n +/// Bits [39:32] are written to bits [79:72] of the result. \n +/// Bits [47:40] are written to bits [95:88] of the result. \n +/// Bits [55:48] are written to bits [111:104] of the result. \n +/// Bits [63:56] are written to bits [127:120] of the result. +/// \returns A 128-bit vector of [16 x i8] containing the interleaved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); } +/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit +/// vectors of [8 x i16] and interleaves them into a 128-bit vector of +/// [8 x i16]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. +/// Bits [15:0] are written to bits [15:0] of the result. \n +/// Bits [31:16] are written to bits [47:32] of the result. \n +/// Bits [47:32] are written to bits [79:64] of the result. \n +/// Bits [63:48] are written to bits [111:96] of the result. +/// \param __b +/// A 128-bit vector of [8 x i16]. +/// Bits [15:0] are written to bits [31:16] of the result. \n +/// Bits [31:16] are written to bits [63:48] of the result. \n +/// Bits [47:32] are written to bits [95:80] of the result. \n +/// Bits [63:48] are written to bits [127:112] of the result. +/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); } +/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of +/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. \n +/// Bits [31:0] are written to bits [31:0] of the destination. \n +/// Bits [63:32] are written to bits [95:64] of the destination. +/// \param __b +/// A 128-bit vector of [4 x i32]. \n +/// Bits [31:0] are written to bits [64:32] of the destination. \n +/// Bits [63:32] are written to bits [127:96] of the destination. +/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 
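(Aside, not part of the diff.) A classic use of the unpack intrinsics documented above is widening: interleaving a byte vector with zero turns PUNPCKLBW / PUNPCKHBW into an unsigned byte-to-word conversion. A minimal sketch with invented data, assuming an SSE2 target:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  unsigned char bytes[16] = {0, 1, 2, 3, 4, 5, 6, 7,
                             8, 9, 10, 11, 12, 13, 14, 250};
  __m128i v = _mm_loadu_si128((const __m128i *)bytes);
  __m128i zero = _mm_setzero_si128();

  /* Interleave with zero: the low eight bytes widen via _mm_unpacklo_epi8,
     the high eight via _mm_unpackhi_epi8. */
  __m128i lo = _mm_unpacklo_epi8(v, zero); /* bytes 0..7  -> [8 x u16] */
  __m128i hi = _mm_unpackhi_epi8(v, zero); /* bytes 8..15 -> [8 x u16] */

  unsigned short w[16];
  _mm_storeu_si128((__m128i *)&w[0], lo);
  _mm_storeu_si128((__m128i *)&w[8], hi);
  for (int i = 0; i < 16; ++i)
    printf("%u ", w[i]);  /* 0 1 2 ... 14 250 */
  printf("\n");
  return 0;
}

Because x86 is little-endian, the zero byte becomes the high half of each 16-bit lane, so each result lane is simply the zero-extended input byte.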
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); } +/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of +/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x i64]. \n +/// Bits [63:0] are written to bits [63:0] of the destination. \n +/// \param __b +/// A 128-bit vector of [2 x i64]. \n +/// Bits [63:0] are written to bits [127:64] of the destination. \n +/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); } +/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit +/// integer. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector operand. The lower 64 bits are moved to the +/// destination. +/// \returns A 64-bit integer containing the lower 64 bits of the parameter. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) { return (__m64)__a[0]; } +/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the +/// upper bits. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction. +/// +/// \param __a +/// A 64-bit value. +/// \returns A 128-bit integer vector. The lower 64 bits contain the value from +/// the operand. The upper 64 bits are assigned zeros. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) { return (__m128i){ (long long)__a, 0 }; } +/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit +/// integer vector, zeroing the upper bits. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. +/// +/// \param __a +/// A 128-bit integer vector operand. The lower 64 bits are moved to the +/// destination. +/// \returns A 128-bit integer vector. The lower 64 bits contain the value from +/// the operand. The upper 64 bits are assigned zeros. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) { return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2); } +/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors +/// of [2 x double] and interleaves them into a 128-bit vector of [2 x +/// double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. \n +/// Bits [127:64] are written to bits [63:0] of the destination. +/// \param __b +/// A 128-bit vector of [2 x double]. \n +/// Bits [127:64] are written to bits [127:64] of the destination. +/// \returns A 128-bit vector of [2 x double] containing the interleaved values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b) { return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); } +/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors +/// of [2 x double] and interleaves them into a 128-bit vector of [2 x +/// double]. 
+/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. \n +/// Bits [63:0] are written to bits [63:0] of the destination. +/// \param __b +/// A 128-bit vector of [2 x double]. \n +/// Bits [63:0] are written to bits [127:64] of the destination. +/// \returns A 128-bit vector of [2 x double] containing the interleaved values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b) { return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); } +/// \brief Extracts the sign bits of the double-precision values in the 128-bit +/// vector of [2 x double], zero-extends the value, and writes it to the +/// low-order bits of the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the values with sign bits to +/// be extracted. +/// \returns The sign bits from each of the double-precision elements in \a __a, +/// written to bits [1:0]. The remaining bits are assigned values of zero. static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) { return __builtin_ia32_movmskpd((__v2df)__a); } + +/// \brief Constructs a 128-bit floating-point vector of [2 x double] from two +/// 128-bit vector parameters of [2 x double], using the immediate-value +/// parameter as a specifier. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction. +/// +/// \param a +/// A 128-bit vector of [2 x double]. +/// \param b +/// A 128-bit vector of [2 x double]. +/// \param i +/// An 8-bit immediate value. The least significant two bits specify which +/// elements to copy from a and b: \n +/// Bit[0] = 0: lower element of a copied to lower element of result. \n +/// Bit[0] = 1: upper element of a copied to lower element of result. \n +/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n +/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n +/// \returns A 128-bit vector of [2 x double] containing the shuffled values. #define _mm_shuffle_pd(a, b, i) __extension__ ({ \ (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 0 + (((i) >> 0) & 0x1), \ 2 + (((i) >> 1) & 0x1)); }) +/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit +/// floating-point vector of [4 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [2 x double]. +/// \returns A 128-bit floating-point vector of [4 x float] containing the same +/// bitwise pattern as the parameter. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) { return (__m128)__a; } +/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit +/// integer vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [2 x double]. +/// \returns A 128-bit integer vector containing the same bitwise pattern as the +/// parameter. 
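(Aside, not part of the diff.) _mm_movemask_pd and the cast intrinsics documented here generate little or no code: the former only collects the two sign bits, and the casts reinterpret the 128-bit register without converting any values. A short sketch with invented values, assuming an SSE2 target:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d d = _mm_set_pd(-2.0, 1.0); /* element 0 = 1.0, element 1 = -2.0 */

  /* Sign bits land in bits [1:0] of the result: here 0b10 == 2. */
  printf("movemask_pd = %d\n", _mm_movemask_pd(d));

  /* The cast is a pure bitwise reinterpretation, not a conversion. */
  __m128i bits = _mm_castpd_si128(d);
  long long q[2];
  _mm_storeu_si128((__m128i *)q, bits);
  printf("raw bits: %016llx %016llx\n",
         (unsigned long long)q[0], (unsigned long long)q[1]);
  /* q[0] == 0x3ff0000000000000 (1.0), q[1] == 0xc000000000000000 (-2.0) */
  return 0;
}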
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) { return (__m128i)__a; } +/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit +/// floating-point vector of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [4 x float]. +/// \returns A 128-bit floating-point vector of [2 x double] containing the same +/// bitwise pattern as the parameter. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) { return (__m128d)__a; } +/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit +/// integer vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [4 x float]. +/// \returns A 128-bit integer vector containing the same bitwise pattern as the +/// parameter. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) { return (__m128i)__a; } +/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector +/// of [4 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 128-bit floating-point vector of [4 x float] containing the same +/// bitwise pattern as the parameter. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) { return (__m128)__a; } +/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector +/// of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 128-bit floating-point vector of [2 x double] containing the same +/// bitwise pattern as the parameter. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) { return (__m128d)__a; } -static __inline__ void __DEFAULT_FN_ATTRS -_mm_pause(void) -{ - __builtin_ia32_pause(); -} +#if defined(__cplusplus) +extern "C" { +#endif +/// \brief Indicates that a spin loop is being executed for the purposes of +/// optimizing power consumption during the loop. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> PAUSE </c> instruction. +/// +void _mm_pause(void); + +#if defined(__cplusplus) +} // extern "C" +#endif #undef __DEFAULT_FN_ATTRS #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) diff --git a/lib/Headers/f16cintrin.h b/lib/Headers/f16cintrin.h index 415bf732fb9f..180712ffc680 100644 --- a/lib/Headers/f16cintrin.h +++ b/lib/Headers/f16cintrin.h @@ -37,7 +37,7 @@ /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTPH2PS instruction. +/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction. /// /// \param __a /// A 16-bit half-precision float value. @@ -59,17 +59,17 @@ _cvtsh_ss(unsigned short __a) /// unsigned short _cvtss_sh(float a, const int imm); /// \endcode /// -/// This intrinsic corresponds to the \c VCVTPS2PH instruction. +/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction. /// /// \param a /// A 32-bit single-precision float value to be converted to a 16-bit /// half-precision float value. 
/// \param imm -/// An immediate value controlling rounding using bits [2:0]: -/// 000: Nearest -/// 001: Down -/// 010: Up -/// 011: Truncate +/// An immediate value controlling rounding using bits [2:0]: \n +/// 000: Nearest \n +/// 001: Down \n +/// 010: Up \n +/// 011: Truncate \n /// 1XX: Use MXCSR.RC for rounding /// \returns The converted 16-bit half-precision float value. #define _cvtss_sh(a, imm) \ @@ -85,16 +85,16 @@ _cvtsh_ss(unsigned short __a) /// __m128i _mm_cvtps_ph(__m128 a, const int imm); /// \endcode /// -/// This intrinsic corresponds to the \c VCVTPS2PH instruction. +/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction. /// /// \param a /// A 128-bit vector containing 32-bit float values. /// \param imm -/// An immediate value controlling rounding using bits [2:0]: -/// 000: Nearest -/// 001: Down -/// 010: Up -/// 011: Truncate +/// An immediate value controlling rounding using bits [2:0]: \n +/// 000: Nearest \n +/// 001: Down \n +/// 010: Up \n +/// 011: Truncate \n /// 1XX: Use MXCSR.RC for rounding /// \returns A 128-bit vector containing converted 16-bit half-precision float /// values. The lower 64 bits are used to store the converted 16-bit @@ -107,7 +107,7 @@ _cvtsh_ss(unsigned short __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTPH2PS instruction. +/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction. /// /// \param __a /// A 128-bit vector containing 16-bit half-precision float values. The lower diff --git a/lib/Headers/float.h b/lib/Headers/float.h index a28269ebebbe..0f453d87cbcb 100644 --- a/lib/Headers/float.h +++ b/lib/Headers/float.h @@ -27,9 +27,12 @@ /* If we're on MinGW, fall back to the system's float.h, which might have * additional definitions provided for Windows. * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx + * + * Also fall back on Darwin to allow additional definitions and + * implementation-defined values. */ -#if (defined(__MINGW32__) || defined(_MSC_VER)) && __STDC_HOSTED__ && \ - __has_include_next(<float.h>) +#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \ + __STDC_HOSTED__ && __has_include_next(<float.h>) # include_next <float.h> /* Undefine anything that we'll be redefining below. */ diff --git a/lib/Headers/fxsrintrin.h b/lib/Headers/fxsrintrin.h index ac6026aa5ba2..786081ca8eab 100644 --- a/lib/Headers/fxsrintrin.h +++ b/lib/Headers/fxsrintrin.h @@ -30,25 +30,75 @@ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr"))) +/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte +/// memory region pointed to by the input parameter \a __p. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> FXSAVE </c> instruction. +/// +/// \param __p +/// A pointer to a 512-byte memory region. The beginning of this memory +/// region should be aligned on a 16-byte boundary. static __inline__ void __DEFAULT_FN_ATTRS -_fxsave(void *__p) { +_fxsave(void *__p) +{ return __builtin_ia32_fxsave(__p); } +/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte +/// memory region pointed to by the input parameter \a __p. The contents of +/// this memory region should have been written to by a previous \c _fxsave +/// or \c _fxsave64 intrinsic. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction. +/// +/// \param __p +/// A pointer to a 512-byte memory region. 
The beginning of this memory +/// region should be aligned on a 16-byte boundary. static __inline__ void __DEFAULT_FN_ATTRS -_fxsave64(void *__p) { - return __builtin_ia32_fxsave64(__p); +_fxrstor(void *__p) +{ + return __builtin_ia32_fxrstor(__p); } +#ifdef __x86_64__ +/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte +/// memory region pointed to by the input parameter \a __p. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction. +/// +/// \param __p +/// A pointer to a 512-byte memory region. The beginning of this memory +/// region should be aligned on a 16-byte boundary. static __inline__ void __DEFAULT_FN_ATTRS -_fxrstor(void *__p) { - return __builtin_ia32_fxrstor(__p); +_fxsave64(void *__p) +{ + return __builtin_ia32_fxsave64(__p); } +/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte +/// memory region pointed to by the input parameter \a __p. The contents of +/// this memory region should have been written to by a previous \c _fxsave +/// or \c _fxsave64 intrinsic. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction. +/// +/// \param __p +/// A pointer to a 512-byte memory region. The beginning of this memory +/// region should be aligned on a 16-byte boundary. static __inline__ void __DEFAULT_FN_ATTRS -_fxrstor64(void *__p) { +_fxrstor64(void *__p) +{ return __builtin_ia32_fxrstor64(__p); } +#endif #undef __DEFAULT_FN_ATTRS diff --git a/lib/Headers/ia32intrin.h b/lib/Headers/ia32intrin.h index 397f3fd13e01..4928300103ad 100644 --- a/lib/Headers/ia32intrin.h +++ b/lib/Headers/ia32intrin.h @@ -60,12 +60,6 @@ __rdpmc(int __A) { return __builtin_ia32_rdpmc(__A); } -/* __rdtsc */ -static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) -__rdtsc(void) { - return __builtin_ia32_rdtsc(); -} - /* __rdtscp */ static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) __rdtscp(unsigned int *__A) { diff --git a/lib/Headers/immintrin.h b/lib/Headers/immintrin.h index 4b2752353d6f..7f91d49fbcec 100644 --- a/lib/Headers/immintrin.h +++ b/lib/Headers/immintrin.h @@ -69,9 +69,44 @@ Intel documents these as being in immintrin.h, and they depend on typedefs from avxintrin.h. */ +/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector +/// containing 16-bit half-precision float values. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm256_cvtps_ph(__m256 a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction. +/// +/// \param a +/// A 256-bit vector containing 32-bit single-precision float values to be +/// converted to 16-bit half-precision float values. +/// \param imm +/// An immediate value controlling rounding using bits [2:0]: \n +/// 000: Nearest \n +/// 001: Down \n +/// 010: Up \n +/// 011: Truncate \n +/// 1XX: Use MXCSR.RC for rounding +/// \returns A 128-bit vector containing the converted 16-bit half-precision +/// float values. #define _mm256_cvtps_ph(a, imm) __extension__ ({ \ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); }) +/// \brief Converts a 128-bit vector containing 16-bit half-precision float +/// values into a 256-bit vector of [8 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction. 
+/// +/// \param __a +/// A 128-bit vector containing 16-bit half-precision float values to be +/// converted to 32-bit single-precision float values. +/// \returns A vector of [8 x float] containing the converted 32-bit +/// single-precision float values. static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c"))) _mm256_cvtph_ps(__m128i __a) { diff --git a/lib/Headers/intrin.h b/lib/Headers/intrin.h index f18711ad1ecf..7c91ebaee8cb 100644 --- a/lib/Headers/intrin.h +++ b/lib/Headers/intrin.h @@ -34,6 +34,10 @@ #include <x86intrin.h> #endif +#if defined(__arm__) +#include <armintr.h> +#endif + /* For the definition of jmp_buf. */ #if __STDC_HOSTED__ #include <setjmp.h> @@ -62,7 +66,9 @@ void __cpuid(int[4], int); static __inline__ void __cpuidex(int[4], int, int); void __debugbreak(void); +static __inline__ __int64 __emul(int, int); +static __inline__ unsigned __int64 __emulu(unsigned int, unsigned int); void __cdecl __fastfail(unsigned int); unsigned int __getcallerseflags(void); @@ -93,6 +99,7 @@ static __inline__ void __movsd(unsigned long *, unsigned long const *, size_t); static __inline__ void __movsw(unsigned short *, unsigned short const *, size_t); +static __inline__ void __nop(void); void __nvreg_restore_fence(void); void __nvreg_save_fence(void); @@ -249,10 +256,12 @@ static __inline__ unsigned long __cdecl _lrotl(unsigned long, int); static __inline__ unsigned long __cdecl _lrotr(unsigned long, int); -static __inline__ -void _ReadBarrier(void); -static __inline__ -void _ReadWriteBarrier(void); +static __inline__ void +__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) +_ReadBarrier(void); +static __inline__ void +__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) +_ReadWriteBarrier(void); static __inline__ void *_ReturnAddress(void); unsigned int _rorx_u32(unsigned int, const unsigned int); @@ -281,8 +290,9 @@ unsigned int _shrx_u32(unsigned int, unsigned int); void _Store_HLERelease(long volatile *, long); void _Store64_HLERelease(__int64 volatile *, __int64); void _StorePointer_HLERelease(void *volatile *, void *); -static __inline__ -void _WriteBarrier(void); +static __inline__ void +__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) +_WriteBarrier(void); unsigned __int32 xbegin(void); void _xend(void); static __inline__ @@ -307,7 +317,6 @@ void __lwpval64(unsigned __int64, unsigned int, unsigned int); unsigned __int64 __lzcnt64(unsigned __int64); static __inline__ void __movsq(unsigned long long *, unsigned long long const *, size_t); -__int64 __mulh(__int64, __int64); static __inline__ unsigned __int64 __popcnt64(unsigned __int64); static __inline__ @@ -378,30 +387,15 @@ void *_InterlockedCompareExchangePointer(void *volatile *_Destination, void *_Exchange, void *_Comparand); void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination, void *_Exchange, void *_Comparand); -static __inline__ -__int64 _InterlockedDecrement64(__int64 volatile *_Addend); -static __inline__ -__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value); -static __inline__ -__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value); void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value); -static __inline__ -__int64 _InterlockedIncrement64(__int64 volatile *_Addend); long _InterlockedOr_np(long volatile *_Value, long _Mask); short _InterlockedOr16_np(short volatile *_Value, short _Mask); -static __inline__ -__int64 
_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask); __int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask); char _InterlockedOr8_np(char volatile *_Value, char _Mask); long _InterlockedXor_np(long volatile *_Value, long _Mask); short _InterlockedXor16_np(short volatile *_Value, short _Mask); -static __inline__ -__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask); __int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask); char _InterlockedXor8_np(char volatile *_Value, char _Mask); -static __inline__ -__int64 _mul128(__int64 _Multiplier, __int64 _Multiplicand, - __int64 *_HighProduct); unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int); __int64 _sarx_i64(__int64, unsigned int); #if __STDC_HOSTED__ @@ -409,119 +403,44 @@ int __cdecl _setjmpex(jmp_buf); #endif unsigned __int64 _shlx_u64(unsigned __int64, unsigned int); unsigned __int64 _shrx_u64(unsigned __int64, unsigned int); -/* - * Multiply two 64-bit integers and obtain a 64-bit result. - * The low-half is returned directly and the high half is in an out parameter. - */ -static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS -_umul128(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand, - unsigned __int64 *_HighProduct) { - unsigned __int128 _FullProduct = - (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand; - *_HighProduct = _FullProduct >> 64; - return _FullProduct; -} -static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS -__umulh(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand) { - unsigned __int128 _FullProduct = - (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand; - return _FullProduct >> 64; -} +static __inline__ +__int64 __mulh(__int64, __int64); +static __inline__ +unsigned __int64 __umulh(unsigned __int64, unsigned __int64); +static __inline__ +__int64 _mul128(__int64, __int64, __int64*); +static __inline__ +unsigned __int64 _umul128(unsigned __int64, + unsigned __int64, + unsigned __int64*); #endif /* __x86_64__ */ -/*----------------------------------------------------------------------------*\ -|* Multiplication -\*----------------------------------------------------------------------------*/ -static __inline__ __int64 __DEFAULT_FN_ATTRS -__emul(int __in1, int __in2) { - return (__int64)__in1 * (__int64)__in2; -} -static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS -__emulu(unsigned int __in1, unsigned int __in2) { - return (unsigned __int64)__in1 * (unsigned __int64)__in2; -} -/*----------------------------------------------------------------------------*\ -|* Bit Twiddling -\*----------------------------------------------------------------------------*/ -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_rotl8(unsigned char _Value, unsigned char _Shift) { - _Shift &= 0x7; - return _Shift ? (_Value << _Shift) | (_Value >> (8 - _Shift)) : _Value; -} -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_rotr8(unsigned char _Value, unsigned char _Shift) { - _Shift &= 0x7; - return _Shift ? (_Value >> _Shift) | (_Value << (8 - _Shift)) : _Value; -} -static __inline__ unsigned short __DEFAULT_FN_ATTRS -_rotl16(unsigned short _Value, unsigned char _Shift) { - _Shift &= 0xf; - return _Shift ? (_Value << _Shift) | (_Value >> (16 - _Shift)) : _Value; -} -static __inline__ unsigned short __DEFAULT_FN_ATTRS -_rotr16(unsigned short _Value, unsigned char _Shift) { - _Shift &= 0xf; - return _Shift ? 
(_Value >> _Shift) | (_Value << (16 - _Shift)) : _Value; -} -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_rotl(unsigned int _Value, int _Shift) { - _Shift &= 0x1f; - return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value; -} -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_rotr(unsigned int _Value, int _Shift) { - _Shift &= 0x1f; - return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value; -} -static __inline__ unsigned long __DEFAULT_FN_ATTRS -_lrotl(unsigned long _Value, int _Shift) { - _Shift &= 0x1f; - return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value; -} -static __inline__ unsigned long __DEFAULT_FN_ATTRS -_lrotr(unsigned long _Value, int _Shift) { - _Shift &= 0x1f; - return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value; -} -static -__inline__ unsigned __int64 __DEFAULT_FN_ATTRS -_rotl64(unsigned __int64 _Value, int _Shift) { - _Shift &= 0x3f; - return _Shift ? (_Value << _Shift) | (_Value >> (64 - _Shift)) : _Value; -} -static -__inline__ unsigned __int64 __DEFAULT_FN_ATTRS -_rotr64(unsigned __int64 _Value, int _Shift) { - _Shift &= 0x3f; - return _Shift ? (_Value >> _Shift) | (_Value << (64 - _Shift)) : _Value; -} +#if defined(__x86_64__) || defined(__arm__) + +static __inline__ +__int64 _InterlockedDecrement64(__int64 volatile *_Addend); +static __inline__ +__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value); +static __inline__ +__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value); +static __inline__ +__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value); +static __inline__ +__int64 _InterlockedIncrement64(__int64 volatile *_Addend); +static __inline__ +__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask); +static __inline__ +__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask); +static __inline__ +__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask); + +#endif + /*----------------------------------------------------------------------------*\ |* Bit Counting and Testing \*----------------------------------------------------------------------------*/ static __inline__ unsigned char __DEFAULT_FN_ATTRS -_BitScanForward(unsigned long *_Index, unsigned long _Mask) { - if (!_Mask) - return 0; - *_Index = __builtin_ctzl(_Mask); - return 1; -} -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_BitScanReverse(unsigned long *_Index, unsigned long _Mask) { - if (!_Mask) - return 0; - *_Index = 31 - __builtin_clzl(_Mask); - return 1; -} -static __inline__ unsigned short __DEFAULT_FN_ATTRS -__popcnt16(unsigned short _Value) { - return __builtin_popcount((int)_Value); -} -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__popcnt(unsigned int _Value) { - return __builtin_popcount(_Value); -} -static __inline__ unsigned char __DEFAULT_FN_ATTRS _bittest(long const *_BitBase, long _BitPos) { return (*_BitBase >> _BitPos) & 1; } @@ -548,26 +467,24 @@ _interlockedbittestandset(long volatile *_BitBase, long _BitPos) { long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST); return (_PrevVal >> _BitPos) & 1; } -#ifdef __x86_64__ +#if defined(__arm__) || defined(__aarch64__) static __inline__ unsigned char __DEFAULT_FN_ATTRS -_BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) { - if (!_Mask) - return 0; - *_Index = __builtin_ctzll(_Mask); - return 1; +_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) { + long _PrevVal = __atomic_fetch_or(_BitBase, 1l << 
_BitPos, __ATOMIC_ACQUIRE); + return (_PrevVal >> _BitPos) & 1; } static __inline__ unsigned char __DEFAULT_FN_ATTRS -_BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) { - if (!_Mask) - return 0; - *_Index = 63 - __builtin_clzll(_Mask); - return 1; +_interlockedbittestandset_nf(long volatile *_BitBase, long _BitPos) { + long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELAXED); + return (_PrevVal >> _BitPos) & 1; } -static __inline__ -unsigned __int64 __DEFAULT_FN_ATTRS -__popcnt64(unsigned __int64 _Value) { - return __builtin_popcountll(_Value); +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_interlockedbittestandset_rel(long volatile *_BitBase, long _BitPos) { + long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELEASE); + return (_PrevVal >> _BitPos) & 1; } +#endif +#ifdef __x86_64__ static __inline__ unsigned char __DEFAULT_FN_ATTRS _bittest64(__int64 const *_BitBase, __int64 _BitPos) { return (*_BitBase >> _BitPos) & 1; @@ -600,196 +517,449 @@ _interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) { /*----------------------------------------------------------------------------*\ |* Interlocked Exchange Add \*----------------------------------------------------------------------------*/ +#if defined(__arm__) || defined(__aarch64__) static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd8(char volatile *_Addend, char _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST); +_InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE); } -static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd16(short volatile *_Addend, short _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST); -} -#ifdef __x86_64__ -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) { - return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST); +static __inline__ char __DEFAULT_FN_ATTRS +_InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED); } -#endif -/*----------------------------------------------------------------------------*\ -|* Interlocked Exchange Sub -\*----------------------------------------------------------------------------*/ static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedExchangeSub8(char volatile *_Subend, char _Value) { - return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST); +_InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED); } static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedExchangeSub16(short volatile *_Subend, short _Value) { - return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST); +_InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE); } static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedExchangeSub(long volatile *_Subend, long _Value) { - return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST); 
+_InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE); } -#ifdef __x86_64__ static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) { - return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST); +_InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value) { + return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE); } #endif /*----------------------------------------------------------------------------*\ |* Interlocked Increment \*----------------------------------------------------------------------------*/ +#if defined(__arm__) || defined(__aarch64__) static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedIncrement16(short volatile *_Value) { - return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST); +_InterlockedIncrement16_acq(short volatile *_Value) { + return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedIncrement16_nf(short volatile *_Value) { + return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedIncrement16_rel(short volatile *_Value) { + return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedIncrement_acq(long volatile *_Value) { + return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedIncrement_nf(long volatile *_Value) { + return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedIncrement_rel(long volatile *_Value) { + return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE); } -#ifdef __x86_64__ static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedIncrement64(__int64 volatile *_Value) { - return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST); +_InterlockedIncrement64_acq(__int64 volatile *_Value) { + return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedIncrement64_nf(__int64 volatile *_Value) { + return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedIncrement64_rel(__int64 volatile *_Value) { + return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE); } #endif /*----------------------------------------------------------------------------*\ |* Interlocked Decrement \*----------------------------------------------------------------------------*/ +#if defined(__arm__) || defined(__aarch64__) static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedDecrement16(short volatile *_Value) { - return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST); +_InterlockedDecrement16_acq(short 
volatile *_Value) { + return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedDecrement16_nf(short volatile *_Value) { + return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedDecrement16_rel(short volatile *_Value) { + return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedDecrement_acq(long volatile *_Value) { + return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedDecrement_nf(long volatile *_Value) { + return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedDecrement_rel(long volatile *_Value) { + return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedDecrement64_acq(__int64 volatile *_Value) { + return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE); } -#ifdef __x86_64__ static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedDecrement64(__int64 volatile *_Value) { - return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST); +_InterlockedDecrement64_nf(__int64 volatile *_Value) { + return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedDecrement64_rel(__int64 volatile *_Value) { + return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE); } #endif /*----------------------------------------------------------------------------*\ |* Interlocked And \*----------------------------------------------------------------------------*/ +#if defined(__arm__) || defined(__aarch64__) static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedAnd8(char volatile *_Value, char _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedAnd8_acq(char volatile *_Value, char _Mask) { + return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE); +} +static __inline__ char __DEFAULT_FN_ATTRS +_InterlockedAnd8_nf(char volatile *_Value, char _Mask) { + return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED); +} +static __inline__ char __DEFAULT_FN_ATTRS +_InterlockedAnd8_rel(char volatile *_Value, char _Mask) { + return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedAnd16_acq(short volatile *_Value, short _Mask) { + return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedAnd16_nf(short volatile *_Value, short _Mask) { + return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED); } static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedAnd16(short volatile *_Value, short _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedAnd16_rel(short volatile *_Value, short _Mask) { + return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE); } static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedAnd(long volatile *_Value, long _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedAnd_acq(long volatile *_Value, long _Mask) { + return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedAnd_nf(long volatile *_Value, long _Mask) { + return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedAnd_rel(long volatile *_Value, long _Mask) { + return __atomic_fetch_and(_Value, _Mask, 
__ATOMIC_RELEASE); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask) { + return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE); } -#ifdef __x86_64__ static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask) { + return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) { + return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE); } #endif /*----------------------------------------------------------------------------*\ |* Interlocked Or \*----------------------------------------------------------------------------*/ +#if defined(__arm__) || defined(__aarch64__) +static __inline__ char __DEFAULT_FN_ATTRS +_InterlockedOr8_acq(char volatile *_Value, char _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE); +} +static __inline__ char __DEFAULT_FN_ATTRS +_InterlockedOr8_nf(char volatile *_Value, char _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED); +} static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedOr8(char volatile *_Value, char _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedOr8_rel(char volatile *_Value, char _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedOr16_acq(short volatile *_Value, short _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedOr16_nf(short volatile *_Value, short _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED); } static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedOr16(short volatile *_Value, short _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedOr16_rel(short volatile *_Value, short _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE); } static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedOr(long volatile *_Value, long _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedOr_acq(long volatile *_Value, long _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedOr_nf(long volatile *_Value, long _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedOr_rel(long volatile *_Value, long _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED); } -#ifdef __x86_64__ static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask) { + return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE); } #endif /*----------------------------------------------------------------------------*\ |* Interlocked Xor 
\*----------------------------------------------------------------------------*/ +#if defined(__arm__) || defined(__aarch64__) +static __inline__ char __DEFAULT_FN_ATTRS +_InterlockedXor8_acq(char volatile *_Value, char _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE); +} +static __inline__ char __DEFAULT_FN_ATTRS +_InterlockedXor8_nf(char volatile *_Value, char _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED); +} static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedXor8(char volatile *_Value, char _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedXor8_rel(char volatile *_Value, char _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedXor16_acq(short volatile *_Value, short _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE); +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedXor16_nf(short volatile *_Value, short _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED); } static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedXor16(short volatile *_Value, short _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedXor16_rel(short volatile *_Value, short _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE); } static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedXor(long volatile *_Value, long _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedXor_acq(long volatile *_Value, long _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedXor_nf(long volatile *_Value, long _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED); +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedXor_rel(long volatile *_Value, long _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE); +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED); } -#ifdef __x86_64__ static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) { - return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST); +_InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask) { + return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE); } #endif /*----------------------------------------------------------------------------*\ |* Interlocked Exchange \*----------------------------------------------------------------------------*/ +#if defined(__arm__) || defined(__aarch64__) static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedExchange8(char volatile *_Target, char _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST); +_InterlockedExchange8_acq(char volatile *_Target, char _Value) { + __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE); + return _Value; +} +static __inline__ char __DEFAULT_FN_ATTRS +_InterlockedExchange8_nf(char volatile *_Target, char _Value) { + __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED); + return _Value; +} +static __inline__ char __DEFAULT_FN_ATTRS +_InterlockedExchange8_rel(char volatile *_Target, char _Value) { + __atomic_exchange(_Target, 
&_Value, &_Value, __ATOMIC_RELEASE); return _Value; } static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedExchange16(short volatile *_Target, short _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST); +_InterlockedExchange16_acq(short volatile *_Target, short _Value) { + __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE); + return _Value; +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedExchange16_nf(short volatile *_Target, short _Value) { + __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED); + return _Value; +} +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedExchange16_rel(short volatile *_Target, short _Value) { + __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE); + return _Value; +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedExchange_acq(long volatile *_Target, long _Value) { + __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE); + return _Value; +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedExchange_nf(long volatile *_Target, long _Value) { + __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED); + return _Value; +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedExchange_rel(long volatile *_Target, long _Value) { + __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE); + return _Value; +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value) { + __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE); + return _Value; +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value) { + __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED); return _Value; } -#ifdef __x86_64__ static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) { - __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST); +_InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value) { + __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE); return _Value; } #endif /*----------------------------------------------------------------------------*\ |* Interlocked Compare Exchange \*----------------------------------------------------------------------------*/ +#if defined(__arm__) || defined(__aarch64__) static __inline__ char __DEFAULT_FN_ATTRS -_InterlockedCompareExchange8(char volatile *_Destination, +_InterlockedCompareExchange8_acq(char volatile *_Destination, char _Exchange, char _Comparand) { __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); + __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); + return _Comparand; +} +static __inline__ char __DEFAULT_FN_ATTRS +_InterlockedCompareExchange8_nf(char volatile *_Destination, + char _Exchange, char _Comparand) { + __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); + return _Comparand; +} +static __inline__ char __DEFAULT_FN_ATTRS +_InterlockedCompareExchange8_rel(char volatile *_Destination, + char _Exchange, char _Comparand) { + __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, + __ATOMIC_SEQ_CST, __ATOMIC_RELEASE); return _Comparand; } static __inline__ short __DEFAULT_FN_ATTRS -_InterlockedCompareExchange16(short volatile *_Destination, +_InterlockedCompareExchange16_acq(short volatile *_Destination, short _Exchange, short _Comparand) { __atomic_compare_exchange(_Destination, &_Comparand, 
&_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); + __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); return _Comparand; } -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedCompareExchange64(__int64 volatile *_Destination, - __int64 _Exchange, __int64 _Comparand) { +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedCompareExchange16_nf(short volatile *_Destination, + short _Exchange, short _Comparand) { __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, - __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); return _Comparand; } -/*----------------------------------------------------------------------------*\ -|* Barriers -\*----------------------------------------------------------------------------*/ -static __inline__ void __DEFAULT_FN_ATTRS -__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) -_ReadWriteBarrier(void) { - __atomic_signal_fence(__ATOMIC_SEQ_CST); +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedCompareExchange16_rel(short volatile *_Destination, + short _Exchange, short _Comparand) { + __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, + __ATOMIC_SEQ_CST, __ATOMIC_RELEASE); + return _Comparand; } -static __inline__ void __DEFAULT_FN_ATTRS -__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) -_ReadBarrier(void) { - __atomic_signal_fence(__ATOMIC_SEQ_CST); +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedCompareExchange_acq(long volatile *_Destination, + long _Exchange, long _Comparand) { + __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, + __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); + return _Comparand; } -static __inline__ void __DEFAULT_FN_ATTRS -__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) -_WriteBarrier(void) { - __atomic_signal_fence(__ATOMIC_SEQ_CST); +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedCompareExchange_nf(long volatile *_Destination, + long _Exchange, long _Comparand) { + __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); + return _Comparand; } -#ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS -__faststorefence(void) { - __atomic_thread_fence(__ATOMIC_SEQ_CST); +static __inline__ short __DEFAULT_FN_ATTRS +_InterlockedCompareExchange_rel(long volatile *_Destination, + long _Exchange, long _Comparand) { + __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, + __ATOMIC_SEQ_CST, __ATOMIC_RELEASE); + return _Comparand; +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedCompareExchange64_acq(__int64 volatile *_Destination, + __int64 _Exchange, __int64 _Comparand) { + __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, + __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); + return _Comparand; +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedCompareExchange64_nf(__int64 volatile *_Destination, + __int64 _Exchange, __int64 _Comparand) { + __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); + return _Comparand; +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedCompareExchange64_rel(__int64 volatile *_Destination, + __int64 _Exchange, __int64 _Comparand) { + __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, + __ATOMIC_SEQ_CST, __ATOMIC_RELEASE); + return _Comparand; } #endif /*----------------------------------------------------------------------------*\ @@ -840,59 +1010,39 @@ 
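
The hunks above give the ARM builds of intrin.h `_acq`, `_nf` and `_rel` variants of the `_Interlocked*` family, each one forwarding to the matching `__atomic_*` builtin with `__ATOMIC_ACQUIRE`, `__ATOMIC_RELAXED` or `__ATOMIC_RELEASE`. A minimal sketch of that mapping outside the header, using only the GCC/Clang builtins; the wrapper name `cas_acquire` and the `Flag` variable are ours, not part of the header:

#include <stdio.h>

static long Flag;  /* stands in for the "long volatile *" destination */

/* Same shape as the new _InterlockedCompareExchange_acq: return the value
 * previously stored at *dst, ordering the successful exchange as an acquire. */
static long cas_acquire(long *dst, long exchange, long comparand) {
  __atomic_compare_exchange_n(dst, &comparand, exchange, 0,
                              __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
  return comparand;
}

int main(void) {
  long prev = cas_acquire(&Flag, 1, 0);          /* 0 -> 1 with acquire */
  __atomic_store_n(&Flag, 0, __ATOMIC_RELEASE);  /* the matching release */
  printf("previous value: %ld\n", prev);         /* prints 0 */
  return 0;
}
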
__readgsqword(unsigned long __offset) { #if defined(__i386__) || defined(__x86_64__) static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) { - __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n) - : "%edi", "%esi", "%ecx"); + __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)); } static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) { - __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n) - : "%edi", "%esi", "%ecx"); + __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)); } static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) { - __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n) - : "%edi", "%esi", "%ecx"); -} -static __inline__ void __DEFAULT_FN_ATTRS -__stosb(unsigned char *__dst, unsigned char __x, size_t __n) { - __asm__("rep stosb" : : "D"(__dst), "a"(__x), "c"(__n) - : "%edi", "%ecx"); + __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n)); } static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst, unsigned long __x, size_t __n) { - __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n) - : "%edi", "%ecx"); + __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)); } static __inline__ void __DEFAULT_FN_ATTRS __stosw(unsigned short *__dst, unsigned short __x, size_t __n) { - __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n) - : "%edi", "%ecx"); + __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n)); } #endif #ifdef __x86_64__ static __inline__ void __DEFAULT_FN_ATTRS __movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) { - __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n) - : "%edi", "%esi", "%ecx"); + __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)); } static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) { - __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n) - : "%edi", "%ecx"); + __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)); } #endif /*----------------------------------------------------------------------------*\ |* Misc \*----------------------------------------------------------------------------*/ -static __inline__ void * __DEFAULT_FN_ATTRS -_AddressOfReturnAddress(void) { - return (void*)((char*)__builtin_frame_address(0) + sizeof(void*)); -} -static __inline__ void * __DEFAULT_FN_ATTRS -_ReturnAddress(void) { - return __builtin_return_address(0); -} #if defined(__i386__) || defined(__x86_64__) static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) { @@ -914,6 +1064,10 @@ static __inline__ void __DEFAULT_FN_ATTRS __halt(void) { __asm__ volatile ("hlt"); } +static __inline__ void __DEFAULT_FN_ATTRS +__nop(void) { + __asm__ volatile ("nop"); +} #endif /*----------------------------------------------------------------------------*\ diff --git a/lib/Headers/lzcntintrin.h b/lib/Headers/lzcntintrin.h index 4c00e42ac3a9..3d2769da3bae 100644 --- a/lib/Headers/lzcntintrin.h +++ b/lib/Headers/lzcntintrin.h @@ -31,18 +31,48 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) +/// \brief Counts the number of leading zero bits in the operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the \c LZCNT instruction. 
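
Earlier in this chunk the `__movs*`/`__stos*` wrappers drop their clobber lists: `%edi`, `%esi` and `%ecx` are already bound as inputs through the "D", "S" and "c" constraints, so naming them again as clobbers was invalid. A hedged, stand-alone sketch of the same rep-string copy; the function name `rep_movsb_copy` is ours, and unlike the header it marks the registers as read-write and adds a "memory" clobber so it is safe in ordinary code:

#include <assert.h>
#include <stddef.h>
#include <string.h>

static void rep_movsb_copy(unsigned char *dst, const unsigned char *src,
                           size_t n) {
#if defined(__i386__) || defined(__x86_64__)
  __asm__ volatile("rep movsb"
                   : "+D"(dst), "+S"(src), "+c"(n)  /* updated by the insn */
                   :
                   : "memory");
#else
  memcpy(dst, src, n);  /* portable fallback off x86 */
#endif
}

int main(void) {
  unsigned char a[8] = "abcdefg", b[8] = {0};
  rep_movsb_copy(b, a, sizeof a);
  assert(memcmp(a, b, sizeof a) == 0);
  return 0;
}
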
+/// +/// \param __X +/// An unsigned 16-bit integer whose leading zeros are to be counted. +/// \returns An unsigned 16-bit integer containing the number of leading zero +/// bits in the operand. static __inline__ unsigned short __DEFAULT_FN_ATTRS __lzcnt16(unsigned short __X) { return __X ? __builtin_clzs(__X) : 16; } +/// \brief Counts the number of leading zero bits in the operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the \c LZCNT instruction. +/// +/// \param __X +/// An unsigned 32-bit integer whose leading zeros are to be counted. +/// \returns An unsigned 32-bit integer containing the number of leading zero +/// bits in the operand. static __inline__ unsigned int __DEFAULT_FN_ATTRS __lzcnt32(unsigned int __X) { return __X ? __builtin_clz(__X) : 32; } +/// \brief Counts the number of leading zero bits in the operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the \c LZCNT instruction. +/// +/// \param __X +/// An unsigned 32-bit integer whose leading zeros are to be counted. +/// \returns An unsigned 32-bit integer containing the number of leading zero +/// bits in the operand. static __inline__ unsigned int __DEFAULT_FN_ATTRS _lzcnt_u32(unsigned int __X) { @@ -50,12 +80,32 @@ _lzcnt_u32(unsigned int __X) } #ifdef __x86_64__ +/// \brief Counts the number of leading zero bits in the operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the \c LZCNT instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose leading zeros are to be counted. +/// \returns An unsigned 64-bit integer containing the number of leading zero +/// bits in the operand. static __inline__ unsigned long long __DEFAULT_FN_ATTRS __lzcnt64(unsigned long long __X) { return __X ? __builtin_clzll(__X) : 64; } +/// \brief Counts the number of leading zero bits in the operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the \c LZCNT instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose leading zeros are to be counted. +/// \returns An unsigned 64-bit integer containing the number of leading zero +/// bits in the operand. static __inline__ unsigned long long __DEFAULT_FN_ATTRS _lzcnt_u64(unsigned long long __X) { diff --git a/lib/Headers/mmintrin.h b/lib/Headers/mmintrin.h index cefd6053aa80..e0c277a65a33 100644 --- a/lib/Headers/mmintrin.h +++ b/lib/Headers/mmintrin.h @@ -39,7 +39,7 @@ typedef char __v8qi __attribute__((__vector_size__(8))); /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c EMMS instruction. +/// This intrinsic corresponds to the <c> EMMS </c> instruction. /// static __inline__ void __DEFAULT_FN_ATTRS _mm_empty(void) @@ -52,7 +52,7 @@ _mm_empty(void) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. +/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. /// /// \param __i /// A 32-bit integer value. @@ -69,7 +69,7 @@ _mm_cvtsi32_si64(int __i) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. +/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. /// /// \param __m /// A 64-bit integer vector. @@ -85,7 +85,7 @@ _mm_cvtsi64_si32(__m64 __m) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVQ / MOVD instruction. +/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction. /// /// \param __i /// A 64-bit signed integer. 
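
The new comments above document the zero-input rule for the LZCNT wrappers: `__builtin_clz` and friends are undefined for 0, so the header special-cases `__X == 0` and returns the operand width. A small reference sketch of the same behaviour; the name `lzcnt32_ref` is ours and needs no `lzcnt` target feature:

#include <stdio.h>

static unsigned lzcnt32_ref(unsigned x) {
  return x ? (unsigned)__builtin_clz(x) : 32;  /* full width for x == 0 */
}

int main(void) {
  printf("%u\n", lzcnt32_ref(0));            /* 32 */
  printf("%u\n", lzcnt32_ref(1));            /* 31 */
  printf("%u\n", lzcnt32_ref(0x80000000u));  /* 0  */
  return 0;
}
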
@@ -101,7 +101,7 @@ _mm_cvtsi64_m64(long long __i) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVQ / MOVD instruction. +/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction. /// /// \param __m /// A 64-bit integer vector. @@ -121,7 +121,7 @@ _mm_cvtm64_si64(__m64 __m) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PACKSSWB instruction. +/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a @@ -151,7 +151,7 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PACKSSDW instruction. +/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a @@ -181,7 +181,7 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PACKUSWB instruction. +/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a @@ -208,19 +208,19 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PUNPCKHBW instruction. +/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction. /// /// \param __m1 -/// A 64-bit integer vector of [8 x i8]. -/// Bits [39:32] are written to bits [7:0] of the result. -/// Bits [47:40] are written to bits [23:16] of the result. -/// Bits [55:48] are written to bits [39:32] of the result. +/// A 64-bit integer vector of [8 x i8]. \n +/// Bits [39:32] are written to bits [7:0] of the result. \n +/// Bits [47:40] are written to bits [23:16] of the result. \n +/// Bits [55:48] are written to bits [39:32] of the result. \n /// Bits [63:56] are written to bits [55:48] of the result. /// \param __m2 /// A 64-bit integer vector of [8 x i8]. -/// Bits [39:32] are written to bits [15:8] of the result. -/// Bits [47:40] are written to bits [31:24] of the result. -/// Bits [55:48] are written to bits [47:40] of the result. +/// Bits [39:32] are written to bits [15:8] of the result. \n +/// Bits [47:40] are written to bits [31:24] of the result. \n +/// Bits [55:48] are written to bits [47:40] of the result. \n /// Bits [63:56] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. @@ -235,15 +235,15 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PUNPCKHWD instruction. +/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. -/// Bits [47:32] are written to bits [15:0] of the result. +/// Bits [47:32] are written to bits [15:0] of the result. \n /// Bits [63:48] are written to bits [47:32] of the result. /// \param __m2 /// A 64-bit integer vector of [4 x i16]. -/// Bits [47:32] are written to bits [31:16] of the result. +/// Bits [47:32] are written to bits [31:16] of the result. \n /// Bits [63:48] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. 
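
The `_mm_packs_*` documentation touched above describes signed saturation: source words outside the 8-bit range clamp to -128 or 127 rather than wrapping. A hedged, x86-only sketch of that behaviour; the operand values and expected output are ours:

#include <mmintrin.h>
#include <stdio.h>

int main(void) {
  __m64 lo = _mm_set_pi16(300, 200, -200, -300);  /* all outside [-128,127] */
  __m64 hi = _mm_set_pi16(1, 2, 3, 4);
  long long bits = _mm_cvtm64_si64(_mm_packs_pi16(lo, hi));
  _mm_empty();  /* leave MMX state before touching x87/printf */
  printf("%016llx\n", (unsigned long long)bits);  /* 010203047f7f8080 */
  return 0;
}
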
@@ -258,7 +258,7 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PUNPCKHDQ instruction. +/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to @@ -279,19 +279,19 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PUNPCKLBW instruction. +/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [8 x i8]. -/// Bits [7:0] are written to bits [7:0] of the result. -/// Bits [15:8] are written to bits [23:16] of the result. -/// Bits [23:16] are written to bits [39:32] of the result. +/// Bits [7:0] are written to bits [7:0] of the result. \n +/// Bits [15:8] are written to bits [23:16] of the result. \n +/// Bits [23:16] are written to bits [39:32] of the result. \n /// Bits [31:24] are written to bits [55:48] of the result. /// \param __m2 /// A 64-bit integer vector of [8 x i8]. -/// Bits [7:0] are written to bits [15:8] of the result. -/// Bits [15:8] are written to bits [31:24] of the result. -/// Bits [23:16] are written to bits [47:40] of the result. +/// Bits [7:0] are written to bits [15:8] of the result. \n +/// Bits [15:8] are written to bits [31:24] of the result. \n +/// Bits [23:16] are written to bits [47:40] of the result. \n /// Bits [31:24] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. @@ -306,15 +306,15 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PUNPCKLWD instruction. +/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. -/// Bits [15:0] are written to bits [15:0] of the result. +/// Bits [15:0] are written to bits [15:0] of the result. \n /// Bits [31:16] are written to bits [47:32] of the result. /// \param __m2 /// A 64-bit integer vector of [4 x i16]. -/// Bits [15:0] are written to bits [31:16] of the result. +/// Bits [15:0] are written to bits [31:16] of the result. \n /// Bits [31:16] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. @@ -329,7 +329,7 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PUNPCKLDQ instruction. +/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to @@ -352,7 +352,7 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PADDB instruction. +/// This intrinsic corresponds to the <c> PADDB </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [8 x i8]. @@ -373,7 +373,7 @@ _mm_add_pi8(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PADDW instruction. +/// This intrinsic corresponds to the <c> PADDW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. @@ -394,7 +394,7 @@ _mm_add_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PADDD instruction. 
+/// This intrinsic corresponds to the <c> PADDD </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [2 x i32]. @@ -416,7 +416,7 @@ _mm_add_pi32(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PADDSB instruction. +/// This intrinsic corresponds to the <c> PADDSB </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [8 x i8]. @@ -439,7 +439,7 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PADDSW instruction. +/// This intrinsic corresponds to the <c> PADDSW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. @@ -461,7 +461,7 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PADDUSB instruction. +/// This intrinsic corresponds to the <c> PADDUSB </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [8 x i8]. @@ -483,7 +483,7 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PADDUSW instruction. +/// This intrinsic corresponds to the <c> PADDUSW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. @@ -504,7 +504,7 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSUBB instruction. +/// This intrinsic corresponds to the <c> PSUBB </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [8 x i8] containing the minuends. @@ -525,7 +525,7 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSUBW instruction. +/// This intrinsic corresponds to the <c> PSUBW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16] containing the minuends. @@ -546,7 +546,7 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSUBD instruction. +/// This intrinsic corresponds to the <c> PSUBD </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [2 x i32] containing the minuends. @@ -569,7 +569,7 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSUBSB instruction. +/// This intrinsic corresponds to the <c> PSUBSB </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [8 x i8] containing the minuends. @@ -592,7 +592,7 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSUBSW instruction. +/// This intrinsic corresponds to the <c> PSUBSW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16] containing the minuends. @@ -615,7 +615,7 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSUBUSB instruction. +/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [8 x i8] containing the minuends. @@ -638,7 +638,7 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSUBUSW instruction. +/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16] containing the minuends. 
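
For the saturating arithmetic documented in the hunks above (PADDS*, PSUBS* and their unsigned forms), a hedged x86-only sketch showing the clamp instead of the wraparound; the operand values are ours:

#include <mmintrin.h>
#include <stdio.h>

int main(void) {
  __m64 a = _mm_set1_pi16(30000), b = _mm_set1_pi16(10000);
  long long sum = _mm_cvtm64_si64(_mm_adds_pi16(a, b));  /* saturating add */
  _mm_empty();
  printf("%016llx\n", (unsigned long long)sum);  /* 7fff7fff7fff7fff */
  return 0;
}
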
@@ -663,7 +663,7 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PMADDWD instruction. +/// This intrinsic corresponds to the <c> PMADDWD </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. @@ -684,7 +684,7 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PMULHW instruction. +/// This intrinsic corresponds to the <c> PMULHW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. @@ -705,7 +705,7 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PMULLW instruction. +/// This intrinsic corresponds to the <c> PMULLW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. @@ -727,14 +727,15 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSLLW instruction. +/// This intrinsic corresponds to the <c> PSLLW </c> instruction. /// /// \param __m /// A 64-bit integer vector of [4 x i16]. /// \param __count /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted -/// values. If __count is greater or equal to 16, the result is set to all 0. +/// values. If \a __count is greater or equal to 16, the result is set to all +/// 0. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_pi16(__m64 __m, __m64 __count) { @@ -748,14 +749,15 @@ _mm_sll_pi16(__m64 __m, __m64 __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSLLW instruction. +/// This intrinsic corresponds to the <c> PSLLW </c> instruction. /// /// \param __m /// A 64-bit integer vector of [4 x i16]. /// \param __count /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted -/// values. If __count is greater or equal to 16, the result is set to all 0. +/// values. If \a __count is greater or equal to 16, the result is set to all +/// 0. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_pi16(__m64 __m, int __count) { @@ -770,14 +772,15 @@ _mm_slli_pi16(__m64 __m, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSLLD instruction. +/// This intrinsic corresponds to the <c> PSLLD </c> instruction. /// /// \param __m /// A 64-bit integer vector of [2 x i32]. /// \param __count /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted -/// values. If __count is greater or equal to 32, the result is set to all 0. +/// values. If \a __count is greater or equal to 32, the result is set to all +/// 0. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_pi32(__m64 __m, __m64 __count) { @@ -791,14 +794,15 @@ _mm_sll_pi32(__m64 __m, __m64 __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSLLD instruction. +/// This intrinsic corresponds to the <c> PSLLD </c> instruction. /// /// \param __m /// A 64-bit integer vector of [2 x i32]. /// \param __count /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted -/// values. If __count is greater or equal to 32, the result is set to all 0. +/// values. If \a __count is greater or equal to 32, the result is set to all +/// 0. 
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_pi32(__m64 __m, int __count) { @@ -811,14 +815,14 @@ _mm_slli_pi32(__m64 __m, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSLLQ instruction. +/// This intrinsic corresponds to the <c> PSLLQ </c> instruction. /// /// \param __m /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \param __count /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector containing the left-shifted value. If -/// __count is greater or equal to 64, the result is set to 0. +/// \a __count is greater or equal to 64, the result is set to 0. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_si64(__m64 __m, __m64 __count) { @@ -831,14 +835,14 @@ _mm_sll_si64(__m64 __m, __m64 __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSLLQ instruction. +/// This intrinsic corresponds to the <c> PSLLQ </c> instruction. /// /// \param __m /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \param __count /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the left-shifted value. If -/// __count is greater or equal to 64, the result is set to 0. +/// \a __count is greater or equal to 64, the result is set to 0. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_si64(__m64 __m, int __count) { @@ -854,7 +858,7 @@ _mm_slli_si64(__m64 __m, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSRAW instruction. +/// This intrinsic corresponds to the <c> PSRAW </c> instruction. /// /// \param __m /// A 64-bit integer vector of [4 x i16]. @@ -876,7 +880,7 @@ _mm_sra_pi16(__m64 __m, __m64 __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSRAW instruction. +/// This intrinsic corresponds to the <c> PSRAW </c> instruction. /// /// \param __m /// A 64-bit integer vector of [4 x i16]. @@ -899,7 +903,7 @@ _mm_srai_pi16(__m64 __m, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSRAD instruction. +/// This intrinsic corresponds to the <c> PSRAD </c> instruction. /// /// \param __m /// A 64-bit integer vector of [2 x i32]. @@ -921,7 +925,7 @@ _mm_sra_pi32(__m64 __m, __m64 __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSRAD instruction. +/// This intrinsic corresponds to the <c> PSRAD </c> instruction. /// /// \param __m /// A 64-bit integer vector of [2 x i32]. @@ -943,7 +947,7 @@ _mm_srai_pi32(__m64 __m, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSRLW instruction. +/// This intrinsic corresponds to the <c> PSRLW </c> instruction. /// /// \param __m /// A 64-bit integer vector of [4 x i16]. @@ -964,7 +968,7 @@ _mm_srl_pi16(__m64 __m, __m64 __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSRLW instruction. +/// This intrinsic corresponds to the <c> PSRLW </c> instruction. /// /// \param __m /// A 64-bit integer vector of [4 x i16]. @@ -986,7 +990,7 @@ _mm_srli_pi16(__m64 __m, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSRLD instruction. +/// This intrinsic corresponds to the <c> PSRLD </c> instruction. /// /// \param __m /// A 64-bit integer vector of [2 x i32]. 
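
The reworded \returns text in the shift hunks above spells out the count rule: a shift count at or beyond the element width zeroes the whole vector. A hedged x86-only sketch; counts and expected output are ours:

#include <mmintrin.h>
#include <stdio.h>

int main(void) {
  __m64 v = _mm_set1_pi16(0x1234);
  long long kept   = _mm_cvtm64_si64(_mm_slli_pi16(v, 4));   /* normal shift */
  long long zeroed = _mm_cvtm64_si64(_mm_slli_pi16(v, 16));  /* count >= 16  */
  _mm_empty();
  printf("%016llx\n%016llx\n", (unsigned long long)kept,
         (unsigned long long)zeroed);  /* 2340234023402340 then 0 */
  return 0;
}
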
@@ -1007,7 +1011,7 @@ _mm_srl_pi32(__m64 __m, __m64 __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSRLD instruction. +/// This intrinsic corresponds to the <c> PSRLD </c> instruction. /// /// \param __m /// A 64-bit integer vector of [2 x i32]. @@ -1027,7 +1031,7 @@ _mm_srli_pi32(__m64 __m, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSRLQ instruction. +/// This intrinsic corresponds to the <c> PSRLQ </c> instruction. /// /// \param __m /// A 64-bit integer vector interpreted as a single 64-bit integer. @@ -1046,7 +1050,7 @@ _mm_srl_si64(__m64 __m, __m64 __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSRLQ instruction. +/// This intrinsic corresponds to the <c> PSRLQ </c> instruction. /// /// \param __m /// A 64-bit integer vector interpreted as a single 64-bit integer. @@ -1063,7 +1067,7 @@ _mm_srli_si64(__m64 __m, int __count) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PAND instruction. +/// This intrinsic corresponds to the <c> PAND </c> instruction. /// /// \param __m1 /// A 64-bit integer vector. @@ -1083,7 +1087,7 @@ _mm_and_si64(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PANDN instruction. +/// This intrinsic corresponds to the <c> PANDN </c> instruction. /// /// \param __m1 /// A 64-bit integer vector. The one's complement of this parameter is used @@ -1102,7 +1106,7 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c POR instruction. +/// This intrinsic corresponds to the <c> POR </c> instruction. /// /// \param __m1 /// A 64-bit integer vector. @@ -1120,7 +1124,7 @@ _mm_or_si64(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PXOR instruction. +/// This intrinsic corresponds to the <c> PXOR </c> instruction. /// /// \param __m1 /// A 64-bit integer vector. @@ -1141,7 +1145,7 @@ _mm_xor_si64(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PCMPEQB instruction. +/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [8 x i8]. @@ -1162,7 +1166,7 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PCMPEQW instruction. +/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. @@ -1183,7 +1187,7 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PCMPEQD instruction. +/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [2 x i32]. @@ -1204,7 +1208,7 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PCMPGTB instruction. +/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [8 x i8]. @@ -1225,7 +1229,7 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PCMPGTW instruction. +/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [4 x i16]. 
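
The comparison intrinsics documented above return per-element masks: all ones where the predicate holds, all zeros elsewhere. A hedged x86-only sketch; the operands are ours:

#include <mmintrin.h>
#include <stdio.h>

int main(void) {
  __m64 a = _mm_set_pi16(4, 3, 2, 1);
  __m64 b = _mm_set_pi16(1, 3, 5, 0);
  long long mask = _mm_cvtm64_si64(_mm_cmpgt_pi16(a, b));
  _mm_empty();
  printf("%016llx\n", (unsigned long long)mask);  /* ffff00000000ffff */
  return 0;
}
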
@@ -1246,7 +1250,7 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PCMPGTD instruction. +/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction. /// /// \param __m1 /// A 64-bit integer vector of [2 x i32]. @@ -1264,7 +1268,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the the \c VXORPS / XORPS instruction. +/// This intrinsic corresponds to the the <c> VXORPS / XORPS </c> instruction. /// /// \returns An initialized 64-bit integer vector with all elements set to zero. static __inline__ __m64 __DEFAULT_FN_ATTRS @@ -1356,7 +1360,7 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSHUFD / PSHUFD instruction. +/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. /// /// \param __i /// A 32-bit integer value used to initialize each vector element of the @@ -1374,7 +1378,7 @@ _mm_set1_pi32(int __i) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPSHUFLW / PSHUFLW instruction. +/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction. /// /// \param __w /// A 16-bit integer value used to initialize each vector element of the @@ -1391,8 +1395,8 @@ _mm_set1_pi16(short __w) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPUNPCKLBW + VPSHUFLW / \c PUNPCKLBW + -/// PSHUFLW instruction. +/// This intrinsic corresponds to the <c> VPUNPCKLBW + VPSHUFLW / PUNPCKLBW + +/// PSHUFLW </c> instruction. /// /// \param __b /// An 8-bit integer value used to initialize each vector element of the diff --git a/lib/Headers/module.modulemap b/lib/Headers/module.modulemap index 3e40d2c08d8c..11ef2f902945 100644 --- a/lib/Headers/module.modulemap +++ b/lib/Headers/module.modulemap @@ -63,11 +63,13 @@ module _Builtin_intrinsics [system] [extern_c] { textual header "mwaitxintrin.h" explicit module mm_malloc { + requires !freestanding header "mm_malloc.h" export * // note: for <stdlib.h> dependency } explicit module cpuid { + requires gnuinlineasm header "cpuid.h" } diff --git a/lib/Headers/opencl-c.h b/lib/Headers/opencl-c.h index 802927490e7f..0c25d312709d 100644 --- a/lib/Headers/opencl-c.h +++ b/lib/Headers/opencl-c.h @@ -17,6 +17,7 @@ #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 #define __ovld __attribute__((overloadable)) +#define __conv __attribute__((convergent)) // Optimizations #define __purefn __attribute__((pure)) @@ -9810,14 +9811,6 @@ float3 __ovld __cnfn native_cos(float3 x); float4 __ovld __cnfn native_cos(float4 x); float8 __ovld __cnfn native_cos(float8 x); float16 __ovld __cnfn native_cos(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_cos(double x); -double2 __ovld __cnfn native_cos(double2 x); -double3 __ovld __cnfn native_cos(double3 x); -double4 __ovld __cnfn native_cos(double4 x); -double8 __ovld __cnfn native_cos(double8 x); -double16 __ovld __cnfn native_cos(double16 x); -#endif //cl_khr_fp64 /** * Compute x / y over an implementation-defined range. 
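
The module.modulemap hunk above marks cpuid.h as requiring `gnuinlineasm`, since that header is nothing but GNU inline-asm wrappers around CPUID. Ordinary use through the same header, as a hedged x86-only sketch; the variable names are ours:

#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  unsigned eax, ebx, ecx, edx;
  if (__get_cpuid(0, &eax, &ebx, &ecx, &edx)) {
    char vendor[13] = {0};
    memcpy(vendor + 0, &ebx, 4);  /* vendor string order is EBX, EDX, ECX */
    memcpy(vendor + 4, &edx, 4);
    memcpy(vendor + 8, &ecx, 4);
    printf("max leaf %u, vendor %s\n", eax, vendor);
  }
  return 0;
}
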
@@ -9829,14 +9822,6 @@ float3 __ovld __cnfn native_divide(float3 x, float3 y); float4 __ovld __cnfn native_divide(float4 x, float4 y); float8 __ovld __cnfn native_divide(float8 x, float8 y); float16 __ovld __cnfn native_divide(float16 x, float16 y); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_divide(double x, double y); -double2 __ovld __cnfn native_divide(double2 x, double2 y); -double3 __ovld __cnfn native_divide(double3 x, double3 y); -double4 __ovld __cnfn native_divide(double4 x, double4 y); -double8 __ovld __cnfn native_divide(double8 x, double8 y); -double16 __ovld __cnfn native_divide(double16 x, double16 y); -#endif //cl_khr_fp64 /** * Compute the base- e exponential of x over an @@ -9849,14 +9834,6 @@ float3 __ovld __cnfn native_exp(float3 x); float4 __ovld __cnfn native_exp(float4 x); float8 __ovld __cnfn native_exp(float8 x); float16 __ovld __cnfn native_exp(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_exp(double x); -double2 __ovld __cnfn native_exp(double2 x); -double3 __ovld __cnfn native_exp(double3 x); -double4 __ovld __cnfn native_exp(double4 x); -double8 __ovld __cnfn native_exp(double8 x); -double16 __ovld __cnfn native_exp(double16 x); -#endif //cl_khr_fp64 /** * Compute the base- 2 exponential of x over an @@ -9869,14 +9846,6 @@ float3 __ovld __cnfn native_exp2(float3 x); float4 __ovld __cnfn native_exp2(float4 x); float8 __ovld __cnfn native_exp2(float8 x); float16 __ovld __cnfn native_exp2(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_exp2(double x); -double2 __ovld __cnfn native_exp2(double2 x); -double3 __ovld __cnfn native_exp2(double3 x); -double4 __ovld __cnfn native_exp2(double4 x); -double8 __ovld __cnfn native_exp2(double8 x); -double16 __ovld __cnfn native_exp2(double16 x); -#endif //cl_khr_fp64 /** * Compute the base- 10 exponential of x over an @@ -9889,14 +9858,6 @@ float3 __ovld __cnfn native_exp10(float3 x); float4 __ovld __cnfn native_exp10(float4 x); float8 __ovld __cnfn native_exp10(float8 x); float16 __ovld __cnfn native_exp10(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_exp10(double x); -double2 __ovld __cnfn native_exp10(double2 x); -double3 __ovld __cnfn native_exp10(double3 x); -double4 __ovld __cnfn native_exp10(double4 x); -double8 __ovld __cnfn native_exp10(double8 x); -double16 __ovld __cnfn native_exp10(double16 x); -#endif //cl_khr_fp64 /** * Compute natural logarithm over an implementationdefined @@ -9909,14 +9870,6 @@ float3 __ovld __cnfn native_log(float3 x); float4 __ovld __cnfn native_log(float4 x); float8 __ovld __cnfn native_log(float8 x); float16 __ovld __cnfn native_log(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_log(double x); -double2 __ovld __cnfn native_log(double2 x); -double3 __ovld __cnfn native_log(double3 x); -double4 __ovld __cnfn native_log(double4 x); -double8 __ovld __cnfn native_log(double8 x); -double16 __ovld __cnfn native_log(double16 x); -#endif //cl_khr_fp64 /** * Compute a base 2 logarithm over an implementationdefined @@ -9928,14 +9881,6 @@ float3 __ovld __cnfn native_log2(float3 x); float4 __ovld __cnfn native_log2(float4 x); float8 __ovld __cnfn native_log2(float8 x); float16 __ovld __cnfn native_log2(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_log2(double x); -double2 __ovld __cnfn native_log2(double2 x); -double3 __ovld __cnfn native_log2(double3 x); -double4 __ovld __cnfn native_log2(double4 x); -double8 __ovld __cnfn native_log2(double8 x); -double16 __ovld __cnfn native_log2(double16 x); -#endif //cl_khr_fp64 /** * 
Compute a base 10 logarithm over an implementationdefined @@ -9947,14 +9892,6 @@ float3 __ovld __cnfn native_log10(float3 x); float4 __ovld __cnfn native_log10(float4 x); float8 __ovld __cnfn native_log10(float8 x); float16 __ovld __cnfn native_log10(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_log10(double x); -double2 __ovld __cnfn native_log10(double2 x); -double3 __ovld __cnfn native_log10(double3 x); -double4 __ovld __cnfn native_log10(double4 x); -double8 __ovld __cnfn native_log10(double8 x); -double16 __ovld __cnfn native_log10(double16 x); -#endif //cl_khr_fp64 /** * Compute x to the power y, where x is >= 0. The range of @@ -9967,14 +9904,6 @@ float3 __ovld __cnfn native_powr(float3 x, float3 y); float4 __ovld __cnfn native_powr(float4 x, float4 y); float8 __ovld __cnfn native_powr(float8 x, float8 y); float16 __ovld __cnfn native_powr(float16 x, float16 y); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_powr(double x, double y); -double2 __ovld __cnfn native_powr(double2 x, double2 y); -double3 __ovld __cnfn native_powr(double3 x, double3 y); -double4 __ovld __cnfn native_powr(double4 x, double4 y); -double8 __ovld __cnfn native_powr(double8 x, double8 y); -double16 __ovld __cnfn native_powr(double16 x, double16 y); -#endif //cl_khr_fp64 /** * Compute reciprocal over an implementation-defined @@ -9986,14 +9915,6 @@ float3 __ovld __cnfn native_recip(float3 x); float4 __ovld __cnfn native_recip(float4 x); float8 __ovld __cnfn native_recip(float8 x); float16 __ovld __cnfn native_recip(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_recip(double x); -double2 __ovld __cnfn native_recip(double2 x); -double3 __ovld __cnfn native_recip(double3 x); -double4 __ovld __cnfn native_recip(double4 x); -double8 __ovld __cnfn native_recip(double8 x); -double16 __ovld __cnfn native_recip(double16 x); -#endif //cl_khr_fp64 /** * Compute inverse square root over an implementationdefined @@ -10005,14 +9926,6 @@ float3 __ovld __cnfn native_rsqrt(float3 x); float4 __ovld __cnfn native_rsqrt(float4 x); float8 __ovld __cnfn native_rsqrt(float8 x); float16 __ovld __cnfn native_rsqrt(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_rsqrt(double x); -double2 __ovld __cnfn native_rsqrt(double2 x); -double3 __ovld __cnfn native_rsqrt(double3 x); -double4 __ovld __cnfn native_rsqrt(double4 x); -double8 __ovld __cnfn native_rsqrt(double8 x); -double16 __ovld __cnfn native_rsqrt(double16 x); -#endif //cl_khr_fp64 /** * Compute sine over an implementation-defined range. 
@@ -10024,14 +9937,6 @@ float3 __ovld __cnfn native_sin(float3 x); float4 __ovld __cnfn native_sin(float4 x); float8 __ovld __cnfn native_sin(float8 x); float16 __ovld __cnfn native_sin(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_sin(double x); -double2 __ovld __cnfn native_sin(double2 x); -double3 __ovld __cnfn native_sin(double3 x); -double4 __ovld __cnfn native_sin(double4 x); -double8 __ovld __cnfn native_sin(double8 x); -double16 __ovld __cnfn native_sin(double16 x); -#endif //cl_khr_fp64 /** * Compute square root over an implementation-defined @@ -10043,14 +9948,6 @@ float3 __ovld __cnfn native_sqrt(float3 x); float4 __ovld __cnfn native_sqrt(float4 x); float8 __ovld __cnfn native_sqrt(float8 x); float16 __ovld __cnfn native_sqrt(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_sqrt(double x); -double2 __ovld __cnfn native_sqrt(double2 x); -double3 __ovld __cnfn native_sqrt(double3 x); -double4 __ovld __cnfn native_sqrt(double4 x); -double8 __ovld __cnfn native_sqrt(double8 x); -double16 __ovld __cnfn native_sqrt(double16 x); -#endif //cl_khr_fp64 /** * Compute tangent over an implementation-defined range. @@ -10062,14 +9959,6 @@ float3 __ovld __cnfn native_tan(float3 x); float4 __ovld __cnfn native_tan(float4 x); float8 __ovld __cnfn native_tan(float8 x); float16 __ovld __cnfn native_tan(float16 x); -#ifdef cl_khr_fp64 -double __ovld __cnfn native_tan(double x); -double2 __ovld __cnfn native_tan(double2 x); -double3 __ovld __cnfn native_tan(double3 x); -double4 __ovld __cnfn native_tan(double4 x); -double8 __ovld __cnfn native_tan(double8 x); -double16 __ovld __cnfn native_tan(double16 x); -#endif //cl_khr_fp64 // OpenCL v1.1 s6.11.3, v1.2 s6.12.3, v2.0 s6.13.3 - Integer Functions @@ -13934,7 +13823,7 @@ typedef uint cl_mem_fence_flags; * image objects and then want to read the updated data. 
*/ -void __ovld barrier(cl_mem_fence_flags flags); +void __ovld __conv barrier(cl_mem_fence_flags flags); #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -13947,8 +13836,8 @@ typedef enum memory_scope memory_scope_sub_group } memory_scope; -void __ovld work_group_barrier(cl_mem_fence_flags flags, memory_scope scope); -void __ovld work_group_barrier(cl_mem_fence_flags flags); +void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope); +void __ovld __conv work_group_barrier(cl_mem_fence_flags flags); #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 // OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions @@ -14728,6 +14617,13 @@ int __ovld atom_xor(volatile __local int *p, int val); unsigned int __ovld atom_xor(volatile __local unsigned int *p, unsigned int val); #endif +#if defined(cl_khr_int64_extended_atomics) +long __ovld atom_xor(volatile __global long *p, long val); +unsigned long __ovld atom_xor(volatile __global unsigned long *p, unsigned long val); +long __ovld atom_xor(volatile __local long *p, long val); +unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long val); +#endif + #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable @@ -15564,9 +15460,11 @@ half16 __ovld __cnfn shuffle2(half8 x, half8 y, ushort16 mask); half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask); #endif //cl_khr_fp16 +#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 // OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf int printf(__constant const char* st, ...); +#endif // OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions @@ -15592,6 +15490,10 @@ int printf(__constant const char* st, ...); #define CLK_FILTER_NEAREST 0x10 #define CLK_FILTER_LINEAR 0x20 +#ifdef cl_khr_gl_msaa_sharing +#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable +#endif //cl_khr_gl_msaa_sharing + /** * Use the coordinate (coord.xy) to do an element lookup in * the 2D image object specified by image. 
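
barrier(), work_group_barrier() and the work-group collectives in the hunks above gain `__conv` (`__attribute__((convergent))`), which tells the optimizer not to sink or duplicate those calls into divergent control flow. A hedged host-side C sketch that exercises work_group_barrier(); error handling and resource release are omitted, the kernel source and buffer sizes are ours, and it assumes an OpenCL 2.0 device plus -lOpenCL:

#define CL_TARGET_OPENCL_VERSION 200
#include <CL/cl.h>
#include <stdio.h>

static const char *kSrc =
    "__kernel void stage(__global int *buf, __local int *tmp) {\n"
    "  size_t l = get_local_id(0), g = get_global_id(0);\n"
    "  tmp[l] = buf[g];\n"
    "  work_group_barrier(CLK_LOCAL_MEM_FENCE);   /* declared __conv above */\n"
    "  buf[g] = tmp[get_local_size(0) - 1 - l];   /* reverse within the group */\n"
    "}\n";

int main(void) {
  enum { N = 8 };
  int data[N];
  for (int i = 0; i < N; ++i) data[i] = i;

  cl_platform_id plat; cl_device_id dev;
  clGetPlatformIDs(1, &plat, NULL);
  clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
  cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, NULL);
  cl_command_queue q = clCreateCommandQueueWithProperties(ctx, dev, NULL, NULL);

  cl_program prog = clCreateProgramWithSource(ctx, 1, &kSrc, NULL, NULL);
  clBuildProgram(prog, 1, &dev, "-cl-std=CL2.0", NULL, NULL);
  cl_kernel k = clCreateKernel(prog, "stage", NULL);

  cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                              sizeof data, data, NULL);
  clSetKernelArg(k, 0, sizeof buf, &buf);
  clSetKernelArg(k, 1, N * sizeof(int), NULL);  /* __local scratch space */

  size_t global = N, local = N;                 /* a single work-group */
  clEnqueueNDRangeKernel(q, k, 1, NULL, &global, &local, 0, NULL, NULL);
  clEnqueueReadBuffer(q, buf, CL_TRUE, 0, sizeof data, data, 0, NULL, NULL);

  for (int i = 0; i < N; ++i) printf("%d ", data[i]);  /* 7 6 5 ... 0 */
  printf("\n");
  return 0;
}
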
@@ -16493,6 +16395,7 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_dept #define CLK_sRGBA 0x10C1 #define CLK_sRGBx 0x10C0 #define CLK_sBGRA 0x10C2 +#define CLK_ABGR 0x10C3 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 int __ovld __cnfn get_image_channel_order(read_only image1d_t image); @@ -16670,101 +16573,101 @@ int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image); // OpenCL v2.0 s6.13.15 - Work-group Functions #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -int __ovld work_group_all(int predicate); -int __ovld work_group_any(int predicate); +int __ovld __conv work_group_all(int predicate); +int __ovld __conv work_group_any(int predicate); #ifdef cl_khr_fp16 -half __ovld work_group_broadcast(half a, size_t local_id); -half __ovld work_group_broadcast(half a, size_t x, size_t y); -half __ovld work_group_broadcast(half a, size_t x, size_t y, size_t z); +half __ovld __conv work_group_broadcast(half a, size_t local_id); +half __ovld __conv work_group_broadcast(half a, size_t x, size_t y); +half __ovld __conv work_group_broadcast(half a, size_t x, size_t y, size_t z); #endif -int __ovld work_group_broadcast(int a, size_t local_id); -int __ovld work_group_broadcast(int a, size_t x, size_t y); -int __ovld work_group_broadcast(int a, size_t x, size_t y, size_t z); -uint __ovld work_group_broadcast(uint a, size_t local_id); -uint __ovld work_group_broadcast(uint a, size_t x, size_t y); -uint __ovld work_group_broadcast(uint a, size_t x, size_t y, size_t z); -long __ovld work_group_broadcast(long a, size_t local_id); -long __ovld work_group_broadcast(long a, size_t x, size_t y); -long __ovld work_group_broadcast(long a, size_t x, size_t y, size_t z); -ulong __ovld work_group_broadcast(ulong a, size_t local_id); -ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y); -ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y, size_t z); -float __ovld work_group_broadcast(float a, size_t local_id); -float __ovld work_group_broadcast(float a, size_t x, size_t y); -float __ovld work_group_broadcast(float a, size_t x, size_t y, size_t z); +int __ovld __conv work_group_broadcast(int a, size_t local_id); +int __ovld __conv work_group_broadcast(int a, size_t x, size_t y); +int __ovld __conv work_group_broadcast(int a, size_t x, size_t y, size_t z); +uint __ovld __conv work_group_broadcast(uint a, size_t local_id); +uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y); +uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y, size_t z); +long __ovld __conv work_group_broadcast(long a, size_t local_id); +long __ovld __conv work_group_broadcast(long a, size_t x, size_t y); +long __ovld __conv work_group_broadcast(long a, size_t x, size_t y, size_t z); +ulong __ovld __conv work_group_broadcast(ulong a, size_t local_id); +ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y); +ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y, size_t z); +float __ovld __conv work_group_broadcast(float a, size_t local_id); +float __ovld __conv work_group_broadcast(float a, size_t x, size_t y); +float __ovld __conv work_group_broadcast(float a, size_t x, size_t y, size_t z); #ifdef cl_khr_fp64 -double __ovld work_group_broadcast(double a, size_t local_id); -double __ovld work_group_broadcast(double a, size_t x, size_t y); -double __ovld work_group_broadcast(double a, size_t x, size_t y, size_t z); +double __ovld __conv work_group_broadcast(double a, size_t local_id); +double __ovld __conv 
work_group_broadcast(double a, size_t x, size_t y); +double __ovld __conv work_group_broadcast(double a, size_t x, size_t y, size_t z); #endif //cl_khr_fp64 #ifdef cl_khr_fp16 -half __ovld work_group_reduce_add(half x); -half __ovld work_group_reduce_min(half x); -half __ovld work_group_reduce_max(half x); -half __ovld work_group_scan_exclusive_add(half x); -half __ovld work_group_scan_exclusive_min(half x); -half __ovld work_group_scan_exclusive_max(half x); -half __ovld work_group_scan_inclusive_add(half x); -half __ovld work_group_scan_inclusive_min(half x); -half __ovld work_group_scan_inclusive_max(half x); +half __ovld __conv work_group_reduce_add(half x); +half __ovld __conv work_group_reduce_min(half x); +half __ovld __conv work_group_reduce_max(half x); +half __ovld __conv work_group_scan_exclusive_add(half x); +half __ovld __conv work_group_scan_exclusive_min(half x); +half __ovld __conv work_group_scan_exclusive_max(half x); +half __ovld __conv work_group_scan_inclusive_add(half x); +half __ovld __conv work_group_scan_inclusive_min(half x); +half __ovld __conv work_group_scan_inclusive_max(half x); #endif -int __ovld work_group_reduce_add(int x); -int __ovld work_group_reduce_min(int x); -int __ovld work_group_reduce_max(int x); -int __ovld work_group_scan_exclusive_add(int x); -int __ovld work_group_scan_exclusive_min(int x); -int __ovld work_group_scan_exclusive_max(int x); -int __ovld work_group_scan_inclusive_add(int x); -int __ovld work_group_scan_inclusive_min(int x); -int __ovld work_group_scan_inclusive_max(int x); -uint __ovld work_group_reduce_add(uint x); -uint __ovld work_group_reduce_min(uint x); -uint __ovld work_group_reduce_max(uint x); -uint __ovld work_group_scan_exclusive_add(uint x); -uint __ovld work_group_scan_exclusive_min(uint x); -uint __ovld work_group_scan_exclusive_max(uint x); -uint __ovld work_group_scan_inclusive_add(uint x); -uint __ovld work_group_scan_inclusive_min(uint x); -uint __ovld work_group_scan_inclusive_max(uint x); -long __ovld work_group_reduce_add(long x); -long __ovld work_group_reduce_min(long x); -long __ovld work_group_reduce_max(long x); -long __ovld work_group_scan_exclusive_add(long x); -long __ovld work_group_scan_exclusive_min(long x); -long __ovld work_group_scan_exclusive_max(long x); -long __ovld work_group_scan_inclusive_add(long x); -long __ovld work_group_scan_inclusive_min(long x); -long __ovld work_group_scan_inclusive_max(long x); -ulong __ovld work_group_reduce_add(ulong x); -ulong __ovld work_group_reduce_min(ulong x); -ulong __ovld work_group_reduce_max(ulong x); -ulong __ovld work_group_scan_exclusive_add(ulong x); -ulong __ovld work_group_scan_exclusive_min(ulong x); -ulong __ovld work_group_scan_exclusive_max(ulong x); -ulong __ovld work_group_scan_inclusive_add(ulong x); -ulong __ovld work_group_scan_inclusive_min(ulong x); -ulong __ovld work_group_scan_inclusive_max(ulong x); -float __ovld work_group_reduce_add(float x); -float __ovld work_group_reduce_min(float x); -float __ovld work_group_reduce_max(float x); -float __ovld work_group_scan_exclusive_add(float x); -float __ovld work_group_scan_exclusive_min(float x); -float __ovld work_group_scan_exclusive_max(float x); -float __ovld work_group_scan_inclusive_add(float x); -float __ovld work_group_scan_inclusive_min(float x); -float __ovld work_group_scan_inclusive_max(float x); +int __ovld __conv work_group_reduce_add(int x); +int __ovld __conv work_group_reduce_min(int x); +int __ovld __conv work_group_reduce_max(int x); +int __ovld __conv 
work_group_scan_exclusive_add(int x); +int __ovld __conv work_group_scan_exclusive_min(int x); +int __ovld __conv work_group_scan_exclusive_max(int x); +int __ovld __conv work_group_scan_inclusive_add(int x); +int __ovld __conv work_group_scan_inclusive_min(int x); +int __ovld __conv work_group_scan_inclusive_max(int x); +uint __ovld __conv work_group_reduce_add(uint x); +uint __ovld __conv work_group_reduce_min(uint x); +uint __ovld __conv work_group_reduce_max(uint x); +uint __ovld __conv work_group_scan_exclusive_add(uint x); +uint __ovld __conv work_group_scan_exclusive_min(uint x); +uint __ovld __conv work_group_scan_exclusive_max(uint x); +uint __ovld __conv work_group_scan_inclusive_add(uint x); +uint __ovld __conv work_group_scan_inclusive_min(uint x); +uint __ovld __conv work_group_scan_inclusive_max(uint x); +long __ovld __conv work_group_reduce_add(long x); +long __ovld __conv work_group_reduce_min(long x); +long __ovld __conv work_group_reduce_max(long x); +long __ovld __conv work_group_scan_exclusive_add(long x); +long __ovld __conv work_group_scan_exclusive_min(long x); +long __ovld __conv work_group_scan_exclusive_max(long x); +long __ovld __conv work_group_scan_inclusive_add(long x); +long __ovld __conv work_group_scan_inclusive_min(long x); +long __ovld __conv work_group_scan_inclusive_max(long x); +ulong __ovld __conv work_group_reduce_add(ulong x); +ulong __ovld __conv work_group_reduce_min(ulong x); +ulong __ovld __conv work_group_reduce_max(ulong x); +ulong __ovld __conv work_group_scan_exclusive_add(ulong x); +ulong __ovld __conv work_group_scan_exclusive_min(ulong x); +ulong __ovld __conv work_group_scan_exclusive_max(ulong x); +ulong __ovld __conv work_group_scan_inclusive_add(ulong x); +ulong __ovld __conv work_group_scan_inclusive_min(ulong x); +ulong __ovld __conv work_group_scan_inclusive_max(ulong x); +float __ovld __conv work_group_reduce_add(float x); +float __ovld __conv work_group_reduce_min(float x); +float __ovld __conv work_group_reduce_max(float x); +float __ovld __conv work_group_scan_exclusive_add(float x); +float __ovld __conv work_group_scan_exclusive_min(float x); +float __ovld __conv work_group_scan_exclusive_max(float x); +float __ovld __conv work_group_scan_inclusive_add(float x); +float __ovld __conv work_group_scan_inclusive_min(float x); +float __ovld __conv work_group_scan_inclusive_max(float x); #ifdef cl_khr_fp64 -double __ovld work_group_reduce_add(double x); -double __ovld work_group_reduce_min(double x); -double __ovld work_group_reduce_max(double x); -double __ovld work_group_scan_exclusive_add(double x); -double __ovld work_group_scan_exclusive_min(double x); -double __ovld work_group_scan_exclusive_max(double x); -double __ovld work_group_scan_inclusive_add(double x); -double __ovld work_group_scan_inclusive_min(double x); -double __ovld work_group_scan_inclusive_max(double x); +double __ovld __conv work_group_reduce_add(double x); +double __ovld __conv work_group_reduce_min(double x); +double __ovld __conv work_group_reduce_max(double x); +double __ovld __conv work_group_scan_exclusive_add(double x); +double __ovld __conv work_group_scan_exclusive_min(double x); +double __ovld __conv work_group_scan_exclusive_max(double x); +double __ovld __conv work_group_scan_inclusive_add(double x); +double __ovld __conv work_group_scan_inclusive_min(double x); +double __ovld __conv work_group_scan_inclusive_max(double x); #endif //cl_khr_fp64 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -16840,11 +16743,11 @@ void __ovld 
retain_event(clk_event_t); void __ovld release_event(clk_event_t); -clk_event_t create_user_event(void); +clk_event_t __ovld create_user_event(void); void __ovld set_user_event_status(clk_event_t e, int state); -bool is_valid_event (clk_event_t event); +bool __ovld is_valid_event (clk_event_t event); void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void* value); @@ -16864,96 +16767,286 @@ uint __ovld get_enqueued_num_sub_groups(void); uint __ovld get_sub_group_id(void); uint __ovld get_sub_group_local_id(void); -void __ovld sub_group_barrier(cl_mem_fence_flags flags); +void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags); #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -void __ovld sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope); +void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope); #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 -int __ovld sub_group_all(int predicate); -int __ovld sub_group_any(int predicate); - -int __ovld sub_group_broadcast(int x, uint sub_group_local_id); -uint __ovld sub_group_broadcast(uint x, uint sub_group_local_id); -long __ovld sub_group_broadcast(long x, uint sub_group_local_id); -ulong __ovld sub_group_broadcast(ulong x, uint sub_group_local_id); -float __ovld sub_group_broadcast(float x, uint sub_group_local_id); - -int __ovld sub_group_reduce_add(int x); -uint __ovld sub_group_reduce_add(uint x); -long __ovld sub_group_reduce_add(long x); -ulong __ovld sub_group_reduce_add(ulong x); -float __ovld sub_group_reduce_add(float x); -int __ovld sub_group_reduce_min(int x); -uint __ovld sub_group_reduce_min(uint x); -long __ovld sub_group_reduce_min(long x); -ulong __ovld sub_group_reduce_min(ulong x); -float __ovld sub_group_reduce_min(float x); -int __ovld sub_group_reduce_max(int x); -uint __ovld sub_group_reduce_max(uint x); -long __ovld sub_group_reduce_max(long x); -ulong __ovld sub_group_reduce_max(ulong x); -float __ovld sub_group_reduce_max(float x); - -int __ovld sub_group_scan_exclusive_add(int x); -uint __ovld sub_group_scan_exclusive_add(uint x); -long __ovld sub_group_scan_exclusive_add(long x); -ulong __ovld sub_group_scan_exclusive_add(ulong x); -float __ovld sub_group_scan_exclusive_add(float x); -int __ovld sub_group_scan_exclusive_min(int x); -uint __ovld sub_group_scan_exclusive_min(uint x); -long __ovld sub_group_scan_exclusive_min(long x); -ulong __ovld sub_group_scan_exclusive_min(ulong x); -float __ovld sub_group_scan_exclusive_min(float x); -int __ovld sub_group_scan_exclusive_max(int x); -uint __ovld sub_group_scan_exclusive_max(uint x); -long __ovld sub_group_scan_exclusive_max(long x); -ulong __ovld sub_group_scan_exclusive_max(ulong x); -float __ovld sub_group_scan_exclusive_max(float x); - -int __ovld sub_group_scan_inclusive_add(int x); -uint __ovld sub_group_scan_inclusive_add(uint x); -long __ovld sub_group_scan_inclusive_add(long x); -ulong __ovld sub_group_scan_inclusive_add(ulong x); -float __ovld sub_group_scan_inclusive_add(float x); -int __ovld sub_group_scan_inclusive_min(int x); -uint __ovld sub_group_scan_inclusive_min(uint x); -long __ovld sub_group_scan_inclusive_min(long x); -ulong __ovld sub_group_scan_inclusive_min(ulong x); -float __ovld sub_group_scan_inclusive_min(float x); -int __ovld sub_group_scan_inclusive_max(int x); -uint __ovld sub_group_scan_inclusive_max(uint x); -long __ovld sub_group_scan_inclusive_max(long x); -ulong __ovld sub_group_scan_inclusive_max(ulong x); -float __ovld sub_group_scan_inclusive_max(float x); +int 
__ovld __conv sub_group_all(int predicate); +int __ovld __conv sub_group_any(int predicate); + +int __ovld __conv sub_group_broadcast(int x, uint sub_group_local_id); +uint __ovld __conv sub_group_broadcast(uint x, uint sub_group_local_id); +long __ovld __conv sub_group_broadcast(long x, uint sub_group_local_id); +ulong __ovld __conv sub_group_broadcast(ulong x, uint sub_group_local_id); +float __ovld __conv sub_group_broadcast(float x, uint sub_group_local_id); + +int __ovld __conv sub_group_reduce_add(int x); +uint __ovld __conv sub_group_reduce_add(uint x); +long __ovld __conv sub_group_reduce_add(long x); +ulong __ovld __conv sub_group_reduce_add(ulong x); +float __ovld __conv sub_group_reduce_add(float x); +int __ovld __conv sub_group_reduce_min(int x); +uint __ovld __conv sub_group_reduce_min(uint x); +long __ovld __conv sub_group_reduce_min(long x); +ulong __ovld __conv sub_group_reduce_min(ulong x); +float __ovld __conv sub_group_reduce_min(float x); +int __ovld __conv sub_group_reduce_max(int x); +uint __ovld __conv sub_group_reduce_max(uint x); +long __ovld __conv sub_group_reduce_max(long x); +ulong __ovld __conv sub_group_reduce_max(ulong x); +float __ovld __conv sub_group_reduce_max(float x); + +int __ovld __conv sub_group_scan_exclusive_add(int x); +uint __ovld __conv sub_group_scan_exclusive_add(uint x); +long __ovld __conv sub_group_scan_exclusive_add(long x); +ulong __ovld __conv sub_group_scan_exclusive_add(ulong x); +float __ovld __conv sub_group_scan_exclusive_add(float x); +int __ovld __conv sub_group_scan_exclusive_min(int x); +uint __ovld __conv sub_group_scan_exclusive_min(uint x); +long __ovld __conv sub_group_scan_exclusive_min(long x); +ulong __ovld __conv sub_group_scan_exclusive_min(ulong x); +float __ovld __conv sub_group_scan_exclusive_min(float x); +int __ovld __conv sub_group_scan_exclusive_max(int x); +uint __ovld __conv sub_group_scan_exclusive_max(uint x); +long __ovld __conv sub_group_scan_exclusive_max(long x); +ulong __ovld __conv sub_group_scan_exclusive_max(ulong x); +float __ovld __conv sub_group_scan_exclusive_max(float x); + +int __ovld __conv sub_group_scan_inclusive_add(int x); +uint __ovld __conv sub_group_scan_inclusive_add(uint x); +long __ovld __conv sub_group_scan_inclusive_add(long x); +ulong __ovld __conv sub_group_scan_inclusive_add(ulong x); +float __ovld __conv sub_group_scan_inclusive_add(float x); +int __ovld __conv sub_group_scan_inclusive_min(int x); +uint __ovld __conv sub_group_scan_inclusive_min(uint x); +long __ovld __conv sub_group_scan_inclusive_min(long x); +ulong __ovld __conv sub_group_scan_inclusive_min(ulong x); +float __ovld __conv sub_group_scan_inclusive_min(float x); +int __ovld __conv sub_group_scan_inclusive_max(int x); +uint __ovld __conv sub_group_scan_inclusive_max(uint x); +long __ovld __conv sub_group_scan_inclusive_max(long x); +ulong __ovld __conv sub_group_scan_inclusive_max(ulong x); +float __ovld __conv sub_group_scan_inclusive_max(float x); #ifdef cl_khr_fp16 -half __ovld sub_group_broadcast(half x, uint sub_group_local_id); -half __ovld sub_group_reduce_add(half x); -half __ovld sub_group_reduce_min(half x); -half __ovld sub_group_reduce_max(half x); -half __ovld sub_group_scan_exclusive_add(half x); -half __ovld sub_group_scan_exclusive_min(half x); -half __ovld sub_group_scan_exclusive_max(half x); -half __ovld sub_group_scan_inclusive_add(half x); -half __ovld sub_group_scan_inclusive_min(half x); -half __ovld sub_group_scan_inclusive_max(half x); +half __ovld __conv sub_group_broadcast(half x, uint 
sub_group_local_id); +half __ovld __conv sub_group_reduce_add(half x); +half __ovld __conv sub_group_reduce_min(half x); +half __ovld __conv sub_group_reduce_max(half x); +half __ovld __conv sub_group_scan_exclusive_add(half x); +half __ovld __conv sub_group_scan_exclusive_min(half x); +half __ovld __conv sub_group_scan_exclusive_max(half x); +half __ovld __conv sub_group_scan_inclusive_add(half x); +half __ovld __conv sub_group_scan_inclusive_min(half x); +half __ovld __conv sub_group_scan_inclusive_max(half x); #endif //cl_khr_fp16 #ifdef cl_khr_fp64 -double __ovld sub_group_broadcast(double x, uint sub_group_local_id); -double __ovld sub_group_reduce_add(double x); -double __ovld sub_group_reduce_min(double x); -double __ovld sub_group_reduce_max(double x); -double __ovld sub_group_scan_exclusive_add(double x); -double __ovld sub_group_scan_exclusive_min(double x); -double __ovld sub_group_scan_exclusive_max(double x); -double __ovld sub_group_scan_inclusive_add(double x); -double __ovld sub_group_scan_inclusive_min(double x); -double __ovld sub_group_scan_inclusive_max(double x); +double __ovld __conv sub_group_broadcast(double x, uint sub_group_local_id); +double __ovld __conv sub_group_reduce_add(double x); +double __ovld __conv sub_group_reduce_min(double x); +double __ovld __conv sub_group_reduce_max(double x); +double __ovld __conv sub_group_scan_exclusive_add(double x); +double __ovld __conv sub_group_scan_exclusive_min(double x); +double __ovld __conv sub_group_scan_exclusive_max(double x); +double __ovld __conv sub_group_scan_inclusive_add(double x); +double __ovld __conv sub_group_scan_inclusive_min(double x); +double __ovld __conv sub_group_scan_inclusive_max(double x); #endif //cl_khr_fp64 #endif //cl_khr_subgroups cl_intel_subgroups +#ifdef cl_amd_media_ops +uint __ovld amd_bitalign(uint a, uint b, uint c); +uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c); +uint3 __ovld amd_bitalign(uint3 a, uint3 b, uint3 c); +uint4 __ovld amd_bitalign(uint4 a, uint4 b, uint4 c); +uint8 __ovld amd_bitalign(uint8 a, uint8 b, uint8 c); +uint16 __ovld amd_bitalign(uint16 a, uint16 b, uint16 c); + +uint __ovld amd_bytealign(uint a, uint b, uint c); +uint2 __ovld amd_bytealign(uint2 a, uint2 b, uint2 c); +uint3 __ovld amd_bytealign(uint3 a, uint3 b, uint3 c); +uint4 __ovld amd_bytealign(uint4 a, uint4 b, uint4 c); +uint8 __ovld amd_bytealign(uint8 a, uint8 b, uint8 c); +uint16 __ovld amd_bytealign(uint16 a, uint16 b, uint16 c); + +uint __ovld amd_lerp(uint a, uint b, uint c); +uint2 __ovld amd_lerp(uint2 a, uint2 b, uint2 c); +uint3 __ovld amd_lerp(uint3 a, uint3 b, uint3 c); +uint4 __ovld amd_lerp(uint4 a, uint4 b, uint4 c); +uint8 __ovld amd_lerp(uint8 a, uint8 b, uint8 c); +uint16 __ovld amd_lerp(uint16 a, uint16 b, uint16 c); + +uint __ovld amd_pack(float4 v); + +uint __ovld amd_sad4(uint4 x, uint4 y, uint z); + +uint __ovld amd_sadhi(uint a, uint b, uint c); +uint2 __ovld amd_sadhi(uint2 a, uint2 b, uint2 c); +uint3 __ovld amd_sadhi(uint3 a, uint3 b, uint3 c); +uint4 __ovld amd_sadhi(uint4 a, uint4 b, uint4 c); +uint8 __ovld amd_sadhi(uint8 a, uint8 b, uint8 c); +uint16 __ovld amd_sadhi(uint16 a, uint16 b, uint16 c); + +uint __ovld amd_sad(uint a, uint b, uint c); +uint2 __ovld amd_sad(uint2 a, uint2 b, uint2 c); +uint3 __ovld amd_sad(uint3 a, uint3 b, uint3 c); +uint4 __ovld amd_sad(uint4 a, uint4 b, uint4 c); +uint8 __ovld amd_sad(uint8 a, uint8 b, uint8 c); +uint16 __ovld amd_sad(uint16 a, uint16 b, uint16 c); + +float __ovld amd_unpack0(uint a); +float2 __ovld amd_unpack0(uint2 
a); +float3 __ovld amd_unpack0(uint3 a); +float4 __ovld amd_unpack0(uint4 a); +float8 __ovld amd_unpack0(uint8 a); +float16 __ovld amd_unpack0(uint16 a); + +float __ovld amd_unpack1(uint a); +float2 __ovld amd_unpack1(uint2 a); +float3 __ovld amd_unpack1(uint3 a); +float4 __ovld amd_unpack1(uint4 a); +float8 __ovld amd_unpack1(uint8 a); +float16 __ovld amd_unpack1(uint16 a); + +float __ovld amd_unpack2(uint a); +float2 __ovld amd_unpack2(uint2 a); +float3 __ovld amd_unpack2(uint3 a); +float4 __ovld amd_unpack2(uint4 a); +float8 __ovld amd_unpack2(uint8 a); +float16 __ovld amd_unpack2(uint16 a); + +float __ovld amd_unpack3(uint a); +float2 __ovld amd_unpack3(uint2 a); +float3 __ovld amd_unpack3(uint3 a); +float4 __ovld amd_unpack3(uint4 a); +float8 __ovld amd_unpack3(uint8 a); +float16 __ovld amd_unpack3(uint16 a); +#endif // cl_amd_media_ops + +#ifdef cl_amd_media_ops2 +int __ovld amd_bfe(int src0, uint src1, uint src2); +int2 __ovld amd_bfe(int2 src0, uint2 src1, uint2 src2); +int3 __ovld amd_bfe(int3 src0, uint3 src1, uint3 src2); +int4 __ovld amd_bfe(int4 src0, uint4 src1, uint4 src2); +int8 __ovld amd_bfe(int8 src0, uint8 src1, uint8 src2); +int16 __ovld amd_bfe(int16 src0, uint16 src1, uint16 src2); + +uint __ovld amd_bfe(uint src0, uint src1, uint src2); +uint2 __ovld amd_bfe(uint2 src0, uint2 src1, uint2 src2); +uint3 __ovld amd_bfe(uint3 src0, uint3 src1, uint3 src2); +uint4 __ovld amd_bfe(uint4 src0, uint4 src1, uint4 src2); +uint8 __ovld amd_bfe(uint8 src0, uint8 src1, uint8 src2); +uint16 __ovld amd_bfe(uint16 src0, uint16 src1, uint16 src2); + +uint __ovld amd_bfm(uint src0, uint src1); +uint2 __ovld amd_bfm(uint2 src0, uint2 src1); +uint3 __ovld amd_bfm(uint3 src0, uint3 src1); +uint4 __ovld amd_bfm(uint4 src0, uint4 src1); +uint8 __ovld amd_bfm(uint8 src0, uint8 src1); +uint16 __ovld amd_bfm(uint16 src0, uint16 src1); + +float __ovld amd_max3(float src0, float src1, float src2); +float2 __ovld amd_max3(float2 src0, float2 src1, float2 src2); +float3 __ovld amd_max3(float3 src0, float3 src1, float3 src2); +float4 __ovld amd_max3(float4 src0, float4 src1, float4 src2); +float8 __ovld amd_max3(float8 src0, float8 src1, float8 src2); +float16 __ovld amd_max3(float16 src0, float16 src1, float16 src2); + +int __ovld amd_max3(int src0, int src1, int src2); +int2 __ovld amd_max3(int2 src0, int2 src1, int2 src2); +int3 __ovld amd_max3(int3 src0, int3 src1, int3 src2); +int4 __ovld amd_max3(int4 src0, int4 src1, int4 src2); +int8 __ovld amd_max3(int8 src0, int8 src1, int8 src2); +int16 __ovld amd_max3(int16 src0, int16 src1, int16 src2); + +uint __ovld amd_max3(uint src0, uint src1, uint src2); +uint2 __ovld amd_max3(uint2 src0, uint2 src1, uint2 src2); +uint3 __ovld amd_max3(uint3 src0, uint3 src1, uint3 src2); +uint4 __ovld amd_max3(uint4 src0, uint4 src1, uint4 src2); +uint8 __ovld amd_max3(uint8 src0, uint8 src1, uint8 src2); +uint16 __ovld amd_max3(uint16 src0, uint16 src1, uint16 src2); + +float __ovld amd_median3(float src0, float src1, float src2); +float2 __ovld amd_median3(float2 src0, float2 src1, float2 src2); +float3 __ovld amd_median3(float3 src0, float3 src1, float3 src2); +float4 __ovld amd_median3(float4 src0, float4 src1, float4 src2); +float8 __ovld amd_median3(float8 src0, float8 src1, float8 src2); +float16 __ovld amd_median3(float16 src0, float16 src1, float16 src2); + +int __ovld amd_median3(int src0, int src1, int src2); +int2 __ovld amd_median3(int2 src0, int2 src1, int2 src2); +int3 __ovld amd_median3(int3 src0, int3 src1, int3 src2); +int4 __ovld 
amd_median3(int4 src0, int4 src1, int4 src2); +int8 __ovld amd_median3(int8 src0, int8 src1, int8 src2); +int16 __ovld amd_median3(int16 src0, int16 src1, int16 src2); + +uint __ovld amd_median3(uint src0, uint src1, uint src2); +uint2 __ovld amd_median3(uint2 src0, uint2 src1, uint2 src2); +uint3 __ovld amd_median3(uint3 src0, uint3 src1, uint3 src2); +uint4 __ovld amd_median3(uint4 src0, uint4 src1, uint4 src2); +uint8 __ovld amd_median3(uint8 src0, uint8 src1, uint8 src2); +uint16 __ovld amd_median3(uint16 src0, uint16 src1, uint16 src2); + +float __ovld amd_min3(float src0, float src1, float src); +float2 __ovld amd_min3(float2 src0, float2 src1, float2 src); +float3 __ovld amd_min3(float3 src0, float3 src1, float3 src); +float4 __ovld amd_min3(float4 src0, float4 src1, float4 src); +float8 __ovld amd_min3(float8 src0, float8 src1, float8 src); +float16 __ovld amd_min3(float16 src0, float16 src1, float16 src); + +int __ovld amd_min3(int src0, int src1, int src2); +int2 __ovld amd_min3(int2 src0, int2 src1, int2 src2); +int3 __ovld amd_min3(int3 src0, int3 src1, int3 src2); +int4 __ovld amd_min3(int4 src0, int4 src1, int4 src2); +int8 __ovld amd_min3(int8 src0, int8 src1, int8 src2); +int16 __ovld amd_min3(int16 src0, int16 src1, int16 src2); + +uint __ovld amd_min3(uint src0, uint src1, uint src2); +uint2 __ovld amd_min3(uint2 src0, uint2 src1, uint2 src2); +uint3 __ovld amd_min3(uint3 src0, uint3 src1, uint3 src2); +uint4 __ovld amd_min3(uint4 src0, uint4 src1, uint4 src2); +uint8 __ovld amd_min3(uint8 src0, uint8 src1, uint8 src2); +uint16 __ovld amd_min3(uint16 src0, uint16 src1, uint16 src2); + +ulong __ovld amd_mqsad(ulong src0, uint src1, ulong src2); +ulong2 __ovld amd_mqsad(ulong2 src0, uint2 src1, ulong2 src2); +ulong3 __ovld amd_mqsad(ulong3 src0, uint3 src1, ulong3 src2); +ulong4 __ovld amd_mqsad(ulong4 src0, uint4 src1, ulong4 src2); +ulong8 __ovld amd_mqsad(ulong8 src0, uint8 src1, ulong8 src2); +ulong16 __ovld amd_mqsad(ulong16 src0, uint16 src1, ulong16 src2); + +ulong __ovld amd_qsad(ulong src0, uint src1, ulong src2); +ulong2 __ovld amd_qsad(ulong2 src0, uint2 src1, ulong2 src2); +ulong3 __ovld amd_qsad(ulong3 src0, uint3 src1, ulong3 src2); +ulong4 __ovld amd_qsad(ulong4 src0, uint4 src1, ulong4 src2); +ulong8 __ovld amd_qsad(ulong8 src0, uint8 src1, ulong8 src2); +ulong16 __ovld amd_qsad(ulong16 src0, uint16 src1, ulong16 src2); + +uint __ovld amd_msad(uint src0, uint src1, uint src2); +uint2 __ovld amd_msad(uint2 src0, uint2 src1, uint2 src2); +uint3 __ovld amd_msad(uint3 src0, uint3 src1, uint3 src2); +uint4 __ovld amd_msad(uint4 src0, uint4 src1, uint4 src2); +uint8 __ovld amd_msad(uint8 src0, uint8 src1, uint8 src2); +uint16 __ovld amd_msad(uint16 src0, uint16 src1, uint16 src2); + +uint __ovld amd_sadd(uint src0, uint src1, uint src2); +uint2 __ovld amd_sadd(uint2 src0, uint2 src1, uint2 src2); +uint3 __ovld amd_sadd(uint3 src0, uint3 src1, uint3 src2); +uint4 __ovld amd_sadd(uint4 src0, uint4 src1, uint4 src2); +uint8 __ovld amd_sadd(uint8 src0, uint8 src1, uint8 src2); +uint16 __ovld amd_sadd(uint16 src0, uint16 src1, uint16 src2); + +uint __ovld amd_sadw(uint src0, uint src1, uint src2); +uint2 __ovld amd_sadw(uint2 src0, uint2 src1, uint2 src2); +uint3 __ovld amd_sadw(uint3 src0, uint3 src1, uint3 src2); +uint4 __ovld amd_sadw(uint4 src0, uint4 src1, uint4 src2); +uint8 __ovld amd_sadw(uint8 src0, uint8 src1, uint8 src2); +uint16 __ovld amd_sadw(uint16 src0, uint16 src1, uint16 src2); +#endif // cl_amd_media_ops2 + // Disable any extensions we may have 
enabled previously. #pragma OPENCL EXTENSION all : disable diff --git a/lib/Headers/pmmintrin.h b/lib/Headers/pmmintrin.h index 5b1058069c44..d4f6487af179 100644 --- a/lib/Headers/pmmintrin.h +++ b/lib/Headers/pmmintrin.h @@ -37,7 +37,7 @@ /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VLDDQU instruction. +/// This intrinsic corresponds to the <c> VLDDQU </c> instruction. /// /// \param __p /// A pointer to a 128-bit integer vector containing integer values. @@ -53,7 +53,7 @@ _mm_lddqu_si128(__m128i const *__p) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VADDSUBPS instruction. +/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float] containing the left source operand. @@ -72,7 +72,7 @@ _mm_addsub_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VHADDPS instruction. +/// This intrinsic corresponds to the <c> VHADDPS </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the source operands. @@ -95,7 +95,7 @@ _mm_hadd_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VHSUBPS instruction. +/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the source operands. @@ -115,18 +115,18 @@ _mm_hsub_ps(__m128 __a, __m128 __b) /// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit /// vector of [4 x float] to float values stored in a 128-bit vector of -/// [4 x float]. -/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of -/// the destination. -/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the -/// destination. +/// [4 x float]. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVSHDUP instruction. +/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. /// /// \param __a -/// A 128-bit vector of [4 x float]. +/// A 128-bit vector of [4 x float]. \n +/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of +/// the destination. \n +/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the +/// destination. /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated /// values. static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -135,20 +135,19 @@ _mm_movehdup_ps(__m128 __a) return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); } -/// \brief Duplicates low-order (even-indexed) values from a 128-bit -/// vector of [4 x float] to float values stored in a 128-bit vector of -/// [4 x float]. -/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of -/// the destination. -/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the -/// destination. +/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of +/// [4 x float] to float values stored in a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVSLDUP instruction. +/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. /// /// \param __a -/// A 128-bit vector of [4 x float]. +/// A 128-bit vector of [4 x float] \n +/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of +/// the destination. 
\n +/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the +/// destination. /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated /// values. static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -162,7 +161,7 @@ _mm_moveldup_ps(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VADDSUBPD instruction. +/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. /// /// \param __a /// A 128-bit vector of [2 x double] containing the left source operand. @@ -181,7 +180,7 @@ _mm_addsub_pd(__m128d __a, __m128d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VHADDPD instruction. +/// This intrinsic corresponds to the <c> VHADDPD </c> instruction. /// /// \param __a /// A 128-bit vector of [2 x double] containing one of the source operands. @@ -204,7 +203,7 @@ _mm_hadd_pd(__m128d __a, __m128d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VHSUBPD instruction. +/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. /// /// \param __a /// A 128-bit vector of [2 x double] containing one of the source operands. @@ -231,7 +230,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b) /// __m128d _mm_loaddup_pd(double const * dp); /// \endcode /// -/// This intrinsic corresponds to the \c VMOVDDUP instruction. +/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. /// /// \param dp /// A pointer to a double-precision value to be moved and duplicated. @@ -245,7 +244,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVDDUP instruction. +/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. /// /// \param __a /// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits @@ -272,7 +271,7 @@ _mm_movedup_pd(__m128d __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c MONITOR instruction. +/// This intrinsic corresponds to the <c> MONITOR </c> instruction. /// /// \param __p /// The memory range to be monitored. The size of the range is determined by @@ -293,7 +292,7 @@ _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c MWAIT instruction. +/// This intrinsic corresponds to the <c> MWAIT </c> instruction. /// /// \param __extensions /// Optional extensions for the monitoring state, which may vary by diff --git a/lib/Headers/popcntintrin.h b/lib/Headers/popcntintrin.h index 7e2f1670805f..0b4793e58bcb 100644 --- a/lib/Headers/popcntintrin.h +++ b/lib/Headers/popcntintrin.h @@ -31,7 +31,7 @@ /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c POPCNT instruction. +/// This intrinsic corresponds to the <c> POPCNT </c> instruction. /// /// \param __A /// An unsigned 32-bit integer operand. @@ -47,7 +47,7 @@ _mm_popcnt_u32(unsigned int __A) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c POPCNT instruction. +/// This intrinsic corresponds to the <c> POPCNT </c> instruction. /// /// \param __A /// A signed 32-bit integer operand. @@ -64,7 +64,7 @@ _popcnt32(int __A) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c POPCNT instruction. +/// This intrinsic corresponds to the <c> POPCNT </c> instruction. /// /// \param __A /// An unsigned 64-bit integer operand. 
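
A minimal usage sketch for the population-count intrinsics documented in this hunk (assuming a POPCNT-capable x86 target, e.g. built with -mpopcnt; the helper names are illustrative):

#include <popcntintrin.h>

/* Counts the set bits of a 32-bit operand via the POPCNT instruction. */
static int count_set_bits32(unsigned int v) {
  return _mm_popcnt_u32(v);
}

/* 64-bit variant; only available when targeting x86-64. */
static long long count_set_bits64(unsigned long long v) {
  return _mm_popcnt_u64(v);
}
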
@@ -80,7 +80,7 @@ _mm_popcnt_u64(unsigned long long __A) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c POPCNT instruction. +/// This intrinsic corresponds to the <c> POPCNT </c> instruction. /// /// \param __A /// A signed 64-bit integer operand. diff --git a/lib/Headers/stdatomic.h b/lib/Headers/stdatomic.h index e03798766014..23bb3a357768 100644 --- a/lib/Headers/stdatomic.h +++ b/lib/Headers/stdatomic.h @@ -45,11 +45,11 @@ extern "C" { #define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE #define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE #define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE -#define ATOMIC_SHORT_T_LOCK_FREE __GCC_ATOMIC_SHORT_T_LOCK_FREE -#define ATOMIC_INT_T_LOCK_FREE __GCC_ATOMIC_INT_T_LOCK_FREE -#define ATOMIC_LONG_T_LOCK_FREE __GCC_ATOMIC_LONG_T_LOCK_FREE -#define ATOMIC_LLONG_T_LOCK_FREE __GCC_ATOMIC_LLONG_T_LOCK_FREE -#define ATOMIC_POINTER_T_LOCK_FREE __GCC_ATOMIC_POINTER_T_LOCK_FREE +#define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE +#define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE +#define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE +#define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE +#define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE /* 7.17.2 Initialization */ diff --git a/lib/Headers/tmmintrin.h b/lib/Headers/tmmintrin.h index a72796ba4a68..80664043a06f 100644 --- a/lib/Headers/tmmintrin.h +++ b/lib/Headers/tmmintrin.h @@ -483,15 +483,15 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b) /// \param __b /// A 128-bit integer vector containing the second source operand. /// \returns A 128-bit integer vector containing the sums of products of both -/// operands: -/// R0 := (__a0 * __b0) + (__a1 * __b1) -/// R1 := (__a2 * __b2) + (__a3 * __b3) -/// R2 := (__a4 * __b4) + (__a5 * __b5) -/// R3 := (__a6 * __b6) + (__a7 * __b7) -/// R4 := (__a8 * __b8) + (__a9 * __b9) -/// R5 := (__a10 * __b10) + (__a11 * __b11) -/// R6 := (__a12 * __b12) + (__a13 * __b13) -/// R7 := (__a14 * __b14) + (__a15 * __b15) +/// operands: \n +/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n +/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n +/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n +/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n +/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n +/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n +/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n +/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maddubs_epi16(__m128i __a, __m128i __b) { @@ -516,11 +516,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b) /// \param __b /// A 64-bit integer vector containing the second source operand. 
/// \returns A 64-bit integer vector containing the sums of products of both -/// operands: -/// R0 := (__a0 * __b0) + (__a1 * __b1) -/// R1 := (__a2 * __b2) + (__a3 * __b3) -/// R2 := (__a4 * __b4) + (__a5 * __b5) -/// R3 := (__a6 * __b6) + (__a7 * __b7) +/// operands: \n +/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n +/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n +/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n +/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_maddubs_pi16(__m64 __a, __m64 __b) { @@ -580,11 +580,11 @@ _mm_mulhrs_pi16(__m64 __a, __m64 __b) /// \param __b /// A 128-bit integer vector containing control bytes corresponding to /// positions in the destination: -/// Bit 7: -/// 1: Clear the corresponding byte in the destination. +/// Bit 7: \n +/// 1: Clear the corresponding byte in the destination. \n /// 0: Copy the selected source byte to the corresponding byte in the -/// destination. -/// Bits [6:4] Reserved. +/// destination. \n +/// Bits [6:4] Reserved. \n /// Bits [3:0] select the source byte to be copied. /// \returns A 128-bit integer vector containing the copied or cleared values. static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -606,10 +606,10 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b) /// \param __b /// A 64-bit integer vector containing control bytes corresponding to /// positions in the destination: -/// Bit 7: -/// 1: Clear the corresponding byte in the destination. +/// Bit 7: \n +/// 1: Clear the corresponding byte in the destination. \n /// 0: Copy the selected source byte to the corresponding byte in the -/// destination. +/// destination. \n /// Bits [3:0] select the source byte to be copied. /// \returns A 64-bit integer vector containing the copied or cleared values. static __inline__ __m64 __DEFAULT_FN_ATTRS diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h index 99cddb0fac82..dc31b85cfd7c 100644 --- a/lib/Headers/xmmintrin.h +++ b/lib/Headers/xmmintrin.h @@ -46,7 +46,7 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16))); /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VADDSS / ADDSS instructions. +/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the source operands. @@ -69,7 +69,7 @@ _mm_add_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VADDPS / ADDPS instructions. +/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the source operands. @@ -88,7 +88,7 @@ _mm_add_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions. +/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits @@ -112,7 +112,7 @@ _mm_sub_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions. +/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing the minuend. @@ -131,7 +131,7 @@ _mm_sub_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMULSS / MULSS instructions. 
+/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the source operands. @@ -154,7 +154,7 @@ _mm_mul_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMULPS / MULPS instructions. +/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the source operands. @@ -173,7 +173,7 @@ _mm_mul_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions. +/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing the dividend. The lower 32 @@ -195,7 +195,7 @@ _mm_div_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions. +/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing the dividend. @@ -214,7 +214,7 @@ _mm_div_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions. +/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -233,7 +233,7 @@ _mm_sqrt_ss(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions. +/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -250,7 +250,7 @@ _mm_sqrt_ps(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions. +/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -269,7 +269,7 @@ _mm_rcp_ss(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions. +/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -286,7 +286,7 @@ _mm_rcp_ps(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions. +/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -306,7 +306,7 @@ _mm_rsqrt_ss(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions. +/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -324,7 +324,7 @@ _mm_rsqrt_ps(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMINSS / MINSS instructions. +/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. 
The lower @@ -341,12 +341,12 @@ _mm_min_ss(__m128 __a, __m128 __b) return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 128-bit vectors of [4 x float] and returns the -/// lesser of each pair of values. +/// \brief Compares two 128-bit vectors of [4 x float] and returns the lesser +/// of each pair of values. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMINPS / MINPS instructions. +/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. @@ -361,12 +361,12 @@ _mm_min_ps(__m128 __a, __m128 __b) } /// \brief Compares two 32-bit float values in the low-order bits of both -/// operands and returns the greater value in the low-order bits of -/// a vector [4 x float]. +/// operands and returns the greater value in the low-order bits of a 128-bit +/// vector of [4 x float]. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions. +/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -388,7 +388,7 @@ _mm_max_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions. +/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. @@ -406,7 +406,7 @@ _mm_max_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VANDPS / ANDPS instructions. +/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions. /// /// \param __a /// A 128-bit vector containing one of the source operands. @@ -426,7 +426,7 @@ _mm_and_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions. +/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing the first source operand. The @@ -446,7 +446,7 @@ _mm_andnot_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VORPS / ORPS instructions. +/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the source operands. @@ -465,7 +465,7 @@ _mm_or_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VXORPS / XORPS instructions. +/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the source operands. @@ -485,7 +485,7 @@ _mm_xor_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions. +/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -506,7 +506,7 @@ _mm_cmpeq_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions. +/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. 
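
As a rough illustration of how the comparison and bitwise intrinsics documented above are typically combined (a hedged sketch under the usual SSE mask convention; blend_min is an illustrative name, not part of the header):

#include <xmmintrin.h>

/* Per-lane select: where a < b keep a, otherwise keep b (a branchless minimum). */
static __m128 blend_min(__m128 a, __m128 b) {
  __m128 mask = _mm_cmplt_ps(a, b);          /* all-ones lanes where a < b */
  return _mm_or_ps(_mm_and_ps(mask, a),      /* lanes taken from a */
                   _mm_andnot_ps(mask, b));  /* remaining lanes from b */
}
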
@@ -526,7 +526,7 @@ _mm_cmpeq_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions. +/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -548,7 +548,7 @@ _mm_cmplt_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions. +/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -569,7 +569,7 @@ _mm_cmplt_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions. +/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -591,7 +591,7 @@ _mm_cmple_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions. +/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -611,7 +611,7 @@ _mm_cmple_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions. +/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -635,7 +635,7 @@ _mm_cmpgt_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions. +/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -656,7 +656,7 @@ _mm_cmpgt_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions. +/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -680,7 +680,7 @@ _mm_cmpge_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions. +/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -699,7 +699,8 @@ _mm_cmpge_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions. +/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -720,7 +721,8 @@ _mm_cmpneq_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions. +/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -740,7 +742,8 @@ _mm_cmpneq_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions. +/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c> +/// instructions. 
/// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -762,7 +765,8 @@ _mm_cmpnlt_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions. +/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -783,7 +787,8 @@ _mm_cmpnlt_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions. +/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -805,7 +810,8 @@ _mm_cmpnle_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions. +/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -826,7 +832,8 @@ _mm_cmpnle_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions. +/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -850,7 +857,8 @@ _mm_cmpngt_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions. +/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -871,7 +879,8 @@ _mm_cmpngt_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions. +/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -895,7 +904,8 @@ _mm_cmpnge_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions. +/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -916,7 +926,8 @@ _mm_cmpnge_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions. +/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. The lower @@ -938,7 +949,8 @@ _mm_cmpord_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions. +/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -959,7 +971,8 @@ _mm_cmpord_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions. +/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float] containing one of the operands. 
The lower @@ -981,7 +994,8 @@ _mm_cmpunord_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions. +/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -999,7 +1013,8 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1020,7 +1035,8 @@ _mm_comieq_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1041,7 +1057,7 @@ _mm_comilt_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1062,7 +1078,7 @@ _mm_comile_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1083,7 +1099,7 @@ _mm_comigt_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1104,7 +1120,7 @@ _mm_comige_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1125,7 +1141,7 @@ _mm_comineq_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1146,7 +1162,7 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1162,13 +1178,13 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b) } /// \brief Performs an unordered comparison of two 32-bit float values using -/// the low-order bits of both operands to determine if the first operand -/// is less than or equal to the second operand and returns the result of -/// the comparison. 
+/// the low-order bits of both operands to determine if the first operand is +/// less than or equal to the second operand and returns the result of the +/// comparison. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1184,13 +1200,13 @@ _mm_ucomile_ss(__m128 __a, __m128 __b) } /// \brief Performs an unordered comparison of two 32-bit float values using -/// the low-order bits of both operands to determine if the first operand -/// is greater than the second operand and returns the result of the +/// the low-order bits of both operands to determine if the first operand is +/// greater than the second operand and returns the result of the /// comparison. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1212,7 +1228,7 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1233,7 +1249,7 @@ _mm_ucomige_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1253,7 +1269,8 @@ _mm_ucomineq_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions. +/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1270,7 +1287,8 @@ _mm_cvtss_si32(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions. +/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1289,7 +1307,8 @@ _mm_cvt_ss2si(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions. +/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1308,7 +1327,7 @@ _mm_cvtss_si64(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTPS2PI instruction. +/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -1324,7 +1343,7 @@ _mm_cvtps_pi32(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTPS2PI instruction. +/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. 
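
A small sketch contrasting the two scalar conversion flavours documented here, rounding versus truncation (float_to_ints is an illustrative helper, assuming SSE is enabled):

#include <xmmintrin.h>

/* CVTSS2SI rounds using the current rounding mode; CVTTSS2SI truncates toward zero. */
static void float_to_ints(float x, int *rounded, int *truncated) {
  __m128 v = _mm_set_ss(x);
  *rounded   = _mm_cvtss_si32(v);
  *truncated = _mm_cvttss_si32(v);
}
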
@@ -1341,7 +1360,8 @@ _mm_cvt_ps2pi(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions. +/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1359,7 +1379,8 @@ _mm_cvttss_si32(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions. +/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1371,13 +1392,15 @@ _mm_cvtt_ss2si(__m128 __a) return _mm_cvttss_si32(__a); } +#ifdef __x86_64__ /// \brief Converts a float value contained in the lower 32 bits of a vector of /// [4 x float] into a 64-bit integer, truncating the result when it is /// inexact. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions. +/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1388,6 +1411,7 @@ _mm_cvttss_si64(__m128 __a) { return __builtin_ia32_cvttss2si64((__v4sf)__a); } +#endif /// \brief Converts two low-order float values in a 128-bit vector of /// [4 x float] into a 64-bit vector of [2 x i32], truncating the result @@ -1395,7 +1419,8 @@ _mm_cvttss_si64(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTTPS2PI / VTTPS2PI instructions. +/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c> +/// instructions. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -1412,7 +1437,7 @@ _mm_cvttps_pi32(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTTPS2PI instruction. +/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -1430,7 +1455,7 @@ _mm_cvtt_ps2pi(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction. +/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -1453,7 +1478,7 @@ _mm_cvtsi32_ss(__m128 __a, int __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction. +/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -1477,7 +1502,7 @@ _mm_cvt_si2ss(__m128 __a, int __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction. +/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -1502,7 +1527,7 @@ _mm_cvtsi64_ss(__m128 __a, long long __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTPI2PS instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. @@ -1525,7 +1550,7 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTPI2PS instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction. 
/// /// \param __a /// A 128-bit vector of [4 x float]. @@ -1546,7 +1571,7 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction. +/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1558,13 +1583,13 @@ _mm_cvtss_f32(__m128 __a) return __a[0]; } -/// \brief Loads two packed float values from the address __p into the +/// \brief Loads two packed float values from the address \a __p into the /// high-order bits of a 128-bit vector of [4 x float]. The low-order bits /// are copied from the low-order bits of the first operand. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction. +/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0] @@ -1585,13 +1610,13 @@ _mm_loadh_pi(__m128 __a, const __m64 *__p) return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); } -/// \brief Loads two packed float values from the address __p into the low-order -/// bits of a 128-bit vector of [4 x float]. The high-order bits are copied -/// from the high-order bits of the first operand. +/// \brief Loads two packed float values from the address \a __p into the +/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits +/// are copied from the high-order bits of the first operand. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction. +/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits @@ -1619,7 +1644,7 @@ _mm_loadl_pi(__m128 __a, const __m64 *__p) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction. +/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. /// /// \param __p /// A pointer to a 32-bit memory location containing a single-precision @@ -1642,13 +1667,13 @@ _mm_load_ss(const float *__p) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling +/// This intrinsic corresponds to the <c> VMOVSS / MOVSS + shuffling </c> /// instruction. /// /// \param __p /// A pointer to a float value to be loaded and duplicated. -/// \returns A 128-bit vector of [4 x float] containing the loaded -/// and duplicated values. +/// \returns A 128-bit vector of [4 x float] containing the loaded and +/// duplicated values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p) { @@ -1666,7 +1691,7 @@ _mm_load1_ps(const float *__p) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction. +/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. /// /// \param __p /// A pointer to a 128-bit memory location. The address of the memory @@ -1683,7 +1708,7 @@ _mm_load_ps(const float *__p) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction. +/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. /// /// \param __p /// A pointer to a 128-bit memory location. 
The address of the memory @@ -1703,7 +1728,7 @@ _mm_loadu_ps(const float *__p) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling +/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c> /// instruction. /// /// \param __p @@ -1725,7 +1750,6 @@ _mm_loadr_ps(const float *__p) /// This intrinsic has no corresponding instruction. /// /// \returns A 128-bit vector of [4 x float] containing undefined values. - static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void) { @@ -1738,7 +1762,7 @@ _mm_undefined_ps(void) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction. +/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. /// /// \param __w /// A single-precision floating-point value used to initialize the lower 32 @@ -1758,7 +1782,7 @@ _mm_set_ss(float __w) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. +/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction. /// /// \param __w /// A single-precision floating-point value used to initialize each vector @@ -1777,7 +1801,7 @@ _mm_set1_ps(float __w) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction. +/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction. /// /// \param __w /// A single-precision floating-point value used to initialize each vector @@ -1849,7 +1873,7 @@ _mm_setr_ps(float __z, float __y, float __x, float __w) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VXORPS / XORPS instruction. +/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. /// /// \returns An initialized 128-bit floating-point vector of [4 x float] with /// all elements set to zero. @@ -1864,7 +1888,7 @@ _mm_setzero_ps(void) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPEXTRQ / MOVQ instruction. +/// This intrinsic corresponds to the <c> VPEXTRQ / MOVQ </c> instruction. /// /// \param __p /// A pointer to a 64-bit memory location. @@ -1881,7 +1905,7 @@ _mm_storeh_pi(__m64 *__p, __m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction. +/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. /// /// \param __p /// A pointer to a memory location that will receive the float values. @@ -1898,7 +1922,7 @@ _mm_storel_pi(__m64 *__p, __m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction. +/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. /// /// \param __p /// A pointer to a 32-bit memory location. @@ -1913,12 +1937,12 @@ _mm_store_ss(float *__p, __m128 __a) ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; } -/// \brief Stores float values from a 128-bit vector of [4 x float] to an -/// unaligned memory location. +/// \brief Stores a 128-bit vector of [4 x float] to an unaligned memory +/// location. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction. +/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. /// /// \param __p /// A pointer to a 128-bit memory location. 
The address of the memory @@ -1934,19 +1958,18 @@ _mm_storeu_ps(float *__p, __m128 __a) ((struct __storeu_ps*)__p)->__v = __a; } -/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into -/// four contiguous elements in an aligned memory location. +/// \brief Stores a 128-bit vector of [4 x float] into an aligned memory +/// location. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling -/// instruction. +/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. /// /// \param __p -/// A pointer to a 128-bit memory location. +/// A pointer to a 128-bit memory location. The address of the memory +/// location has to be 16-byte aligned. /// \param __a -/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each -/// of the four contiguous elements pointed by __p. +/// A 128-bit vector of [4 x float] containing the values to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a) { @@ -1958,14 +1981,14 @@ _mm_store_ps(float *__p, __m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling +/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c> /// instruction. /// /// \param __p /// A pointer to a 128-bit memory location. /// \param __a /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each -/// of the four contiguous elements pointed by __p. +/// of the four contiguous elements pointed by \a __p. static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a) { @@ -1973,18 +1996,19 @@ _mm_store1_ps(float *__p, __m128 __a) _mm_store_ps(__p, __a); } -/// \brief Stores float values from a 128-bit vector of [4 x float] to an -/// aligned memory location. +/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into +/// four contiguous elements in an aligned memory location. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction. +/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c> +/// instruction. /// /// \param __p -/// A pointer to a 128-bit memory location. The address of the memory -/// location has to be 128-bit aligned. +/// A pointer to a 128-bit memory location. /// \param __a -/// A 128-bit vector of [4 x float] containing the values to be stored. +/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each +/// of the four contiguous elements pointed by \a __p. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a) { @@ -1996,7 +2020,7 @@ _mm_store_ps1(float *__p, __m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling +/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c> /// instruction. /// /// \param __p @@ -2029,20 +2053,21 @@ _mm_storer_ps(float *__p, __m128 __a) /// void _mm_prefetch(const void * a, const int sel); /// \endcode /// -/// This intrinsic corresponds to the \c PREFETCHNTA instruction. +/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction. /// /// \param a /// A pointer to a memory location containing a cache line of data. /// \param sel -/// A predefined integer constant specifying the type of prefetch operation: -/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. -/// The PREFETCHNTA instruction will be generated. 
+/// A predefined integer constant specifying the type of prefetch +/// operation: \n +/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The +/// PREFETCHNTA instruction will be generated. \n /// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will -/// be generated. +/// be generated. \n /// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will -/// be generated. +/// be generated. \n /// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will -/// be generated. +/// be generated. #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel))) #endif @@ -2052,7 +2077,7 @@ _mm_storer_ps(float *__p, __m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c MOVNTQ instruction. +/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction. /// /// \param __p /// A pointer to an aligned memory location used to store the register value. @@ -2070,7 +2095,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction. +/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. /// /// \param __p /// A pointer to a 128-bit aligned memory location that will receive the @@ -2083,6 +2108,10 @@ _mm_stream_ps(float *__p, __m128 __a) __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); } +#if defined(__cplusplus) +extern "C" { +#endif + /// \brief Forces strong memory ordering (serialization) between store /// instructions preceding this instruction and store instructions following /// this instruction, ensuring the system completes all previous stores @@ -2090,28 +2119,32 @@ _mm_stream_ps(float *__p, __m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c SFENCE instruction. +/// This intrinsic corresponds to the <c> SFENCE </c> instruction. /// -static __inline__ void __DEFAULT_FN_ATTRS -_mm_sfence(void) -{ - __builtin_ia32_sfence(); -} +void _mm_sfence(void); + +#if defined(__cplusplus) +} // extern "C" +#endif /// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and /// returns it, as specified by the immediate integer operand. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction. +/// \code +/// void _mm_extract_pi(__m64 a, int n); +/// \endcode /// -/// \param __a +/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. +/// +/// \param a /// A 64-bit vector of [4 x i16]. -/// \param __n -/// An immediate integer operand that determines which bits are extracted: -/// 0: Bits [15:0] are copied to the destination. -/// 1: Bits [31:16] are copied to the destination. -/// 2: Bits [47:32] are copied to the destination. +/// \param n +/// An immediate integer operand that determines which bits are extracted: \n +/// 0: Bits [15:0] are copied to the destination. \n +/// 1: Bits [31:16] are copied to the destination. \n +/// 2: Bits [47:32] are copied to the destination. \n /// 3: Bits [63:48] are copied to the destination. /// \returns A 16-bit integer containing the extracted 16 bits of packed data. #define _mm_extract_pi16(a, n) __extension__ ({ \ @@ -2119,26 +2152,30 @@ _mm_sfence(void) /// \brief Copies data from the 64-bit vector of [4 x i16] to the destination, /// and inserts the lower 16-bits of an integer operand at the 16-bit offset -/// specified by the immediate operand __n. +/// specified by the immediate operand \a n. 
/// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction. +/// \code +/// void _mm_insert_pi(__m64 a, int d, int n); +/// \endcode /// -/// \param __a +/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction. +/// +/// \param a /// A 64-bit vector of [4 x i16]. -/// \param __d +/// \param d /// An integer. The lower 16-bit value from this operand is written to the -/// destination at the offset specified by operand __n. -/// \param __n +/// destination at the offset specified by operand \a n. +/// \param n /// An immediate integer operant that determines which the bits to be used -/// in the destination. -/// 0: Bits [15:0] are copied to the destination. -/// 1: Bits [31:16] are copied to the destination. -/// 2: Bits [47:32] are copied to the destination. -/// 3: Bits [63:48] are copied to the destination. +/// in the destination. \n +/// 0: Bits [15:0] are copied to the destination. \n +/// 1: Bits [31:16] are copied to the destination. \n +/// 2: Bits [47:32] are copied to the destination. \n +/// 3: Bits [63:48] are copied to the destination. \n /// The remaining bits in the destination are copied from the corresponding -/// bits in operand __a. +/// bits in operand \a a. /// \returns A 64-bit integer vector containing the copied packed data from the /// operands. #define _mm_insert_pi16(a, d, n) __extension__ ({ \ @@ -2150,7 +2187,7 @@ _mm_sfence(void) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PMAXSW instruction. +/// This intrinsic corresponds to the <c> PMAXSW </c> instruction. /// /// \param __a /// A 64-bit integer vector containing one of the source operands. @@ -2169,7 +2206,7 @@ _mm_max_pi16(__m64 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PMAXUB instruction. +/// This intrinsic corresponds to the <c> PMAXUB </c> instruction. /// /// \param __a /// A 64-bit integer vector containing one of the source operands. @@ -2188,7 +2225,7 @@ _mm_max_pu8(__m64 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PMINSW instruction. +/// This intrinsic corresponds to the <c> PMINSW </c> instruction. /// /// \param __a /// A 64-bit integer vector containing one of the source operands. @@ -2207,7 +2244,7 @@ _mm_min_pi16(__m64 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PMINUB instruction. +/// This intrinsic corresponds to the <c> PMINUB </c> instruction. /// /// \param __a /// A 64-bit integer vector containing one of the source operands. @@ -2226,7 +2263,7 @@ _mm_min_pu8(__m64 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PMOVMSKB instruction. +/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction. /// /// \param __a /// A 64-bit integer vector containing the values with bits to be extracted. @@ -2244,7 +2281,7 @@ _mm_movemask_pi8(__m64 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PMULHUW instruction. +/// This intrinsic corresponds to the <c> PMULHUW </c> instruction. /// /// \param __a /// A 64-bit integer vector containing one of the source operands. @@ -2262,27 +2299,31 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSHUFW instruction. 
-/// /// \code /// __m64 _mm_shuffle_pi16(__m64 a, const int n); /// \endcode /// +/// This intrinsic corresponds to the <c> PSHUFW </c> instruction. +/// /// \param a /// A 64-bit integer vector containing the values to be shuffled. /// \param n /// An immediate value containing an 8-bit value specifying which elements to -/// copy from a. The destinations within the 64-bit destination are assigned -/// values as follows: -/// Bits [1:0] are used to assign values to bits [15:0] in the destination. -/// Bits [3:2] are used to assign values to bits [31:16] in the destination. -/// Bits [5:4] are used to assign values to bits [47:32] in the destination. -/// Bits [7:6] are used to assign values to bits [63:48] in the destination. -/// Bit value assignments: -/// 00: assigned from bits [15:0] of a. -/// 01: assigned from bits [31:16] of a. -/// 10: assigned from bits [47:32] of a. -/// 11: assigned from bits [63:48] of a. +/// copy from \a a. The destinations within the 64-bit destination are +/// assigned values as follows: \n +/// Bits [1:0] are used to assign values to bits [15:0] in the +/// destination. \n +/// Bits [3:2] are used to assign values to bits [31:16] in the +/// destination. \n +/// Bits [5:4] are used to assign values to bits [47:32] in the +/// destination. \n +/// Bits [7:6] are used to assign values to bits [63:48] in the +/// destination. \n +/// Bit value assignments: \n +/// 00: assigned from bits [15:0] of \a a. \n +/// 01: assigned from bits [31:16] of \a a. \n +/// 10: assigned from bits [47:32] of \a a. \n +/// 11: assigned from bits [63:48] of \a a. /// \returns A 64-bit integer vector containing the shuffled values. #define _mm_shuffle_pi16(a, n) __extension__ ({ \ (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); }) @@ -2295,15 +2336,15 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c MASKMOVQ instruction. +/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction. /// /// \param __d /// A 64-bit integer vector containing the values with elements to be copied. /// \param __n /// A 64-bit integer vector operand. The most significant bit from each 8-bit -/// element determines whether the corresponding element in operand __d is -/// copied. If the most significant bit of a given element is 1, the -/// corresponding element in operand __d is copied. +/// element determines whether the corresponding element in operand \a __d +/// is copied. If the most significant bit of a given element is 1, the +/// corresponding element in operand \a __d is copied. /// \param __p /// A pointer to a 64-bit memory location that will receive the conditionally /// copied integer values. The address of the memory location does not have @@ -2320,7 +2361,7 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PAVGB instruction. +/// This intrinsic corresponds to the <c> PAVGB </c> instruction. /// /// \param __a /// A 64-bit integer vector containing one of the source operands. @@ -2339,7 +2380,7 @@ _mm_avg_pu8(__m64 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PAVGW instruction. +/// This intrinsic corresponds to the <c> PAVGW </c> instruction. /// /// \param __a /// A 64-bit integer vector containing one of the source operands. 
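The MMX/SSE integer helpers documented in the hunks above (_mm_extract_pi16, _mm_shuffle_pi16, and friends) are easiest to follow from a usage sketch. The following is an illustration only, not part of the patch; it assumes an x86 target built with SSE enabled (e.g. cc -msse demo.c) and uses only intrinsics declared by this header and mmintrin.h:

#include <stdio.h>
#include <xmmintrin.h>   /* also pulls in mmintrin.h for the __m64 helpers */

int main(void)
{
    __m64 v = _mm_setr_pi16(10, 20, 30, 40);   /* [4 x i16] = 10, 20, 30, 40 */

    /* Selector 0x1B == 0b00011011 reverses the four 16-bit elements. */
    __m64 rev = _mm_shuffle_pi16(v, 0x1B);

    /* _mm_extract_pi16() yields the selected 16-bit element as an int. */
    int lo = _mm_extract_pi16(rev, 0);         /* 40 */
    int hi = _mm_extract_pi16(rev, 3);         /* 10 */

    _mm_empty();                               /* leave MMX state (EMMS) */
    printf("lo=%d hi=%d\n", lo, hi);
    return 0;
}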
@@ -2359,7 +2400,7 @@ _mm_avg_pu16(__m64 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c PSADBW instruction. +/// This intrinsic corresponds to the <c> PSADBW </c> instruction. /// /// \param __a /// A 64-bit integer vector containing one of the source operands. @@ -2374,24 +2415,42 @@ _mm_sad_pu8(__m64 __a, __m64 __b) return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); } +#if defined(__cplusplus) +extern "C" { +#endif + /// \brief Returns the contents of the MXCSR register as a 32-bit unsigned -/// integer value. There are several groups of macros associated with this +/// integer value. +/// +/// There are several groups of macros associated with this /// intrinsic, including: -/// * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, +/// <ul> +/// <li> +/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, /// _MM_EXCEPT_INEXACT. There is a convenience wrapper /// _MM_GET_EXCEPTION_STATE(). -/// * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, +/// </li> +/// <li> +/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. /// There is a convenience wrapper _MM_GET_EXCEPTION_MASK(). -/// * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, +/// </li> +/// <li> +/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper /// _MM_GET_ROUNDING_MODE(x) where x is one of these macros. -/// * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. +/// </li> +/// <li> +/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. /// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE(). -/// * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, +/// </li> +/// <li> +/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper /// _MM_GET_DENORMALS_ZERO_MODE(). +/// </li> +/// </ul> /// /// For example, the expression below checks if an overflow exception has /// occurred: @@ -2402,35 +2461,45 @@ _mm_sad_pu8(__m64 __a, __m64 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction. +/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction. /// /// \returns A 32-bit unsigned integer containing the contents of the MXCSR /// register. -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mm_getcsr(void) -{ - return __builtin_ia32_stmxcsr(); -} - -/// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There -/// are several groups of macros associated with this intrinsic, including: -/// * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, +unsigned int _mm_getcsr(void); + +/// \brief Sets the MXCSR register with the 32-bit unsigned integer value. +/// +/// There are several groups of macros associated with this intrinsic, +/// including: +/// <ul> +/// <li> +/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, /// _MM_EXCEPT_INEXACT. There is a convenience wrapper /// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros. 
-/// * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, +/// </li> +/// <li> +/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. /// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one /// of these macros. -/// * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, +/// </li> +/// <li> +/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper /// _MM_SET_ROUNDING_MODE(x) where x is one of these macros. -/// * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. +/// </li> +/// <li> +/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. /// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is /// one of these macros. -/// * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, +/// </li> +/// <li> +/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper /// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros. +/// </li> +/// </ul> /// /// For example, the following expression causes subsequent floating-point /// operations to round up: @@ -2444,15 +2513,15 @@ _mm_getcsr(void) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction. +/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction. /// /// \param __i /// A 32-bit unsigned integer value to be written to the MXCSR register. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_setcsr(unsigned int __i) -{ - __builtin_ia32_ldmxcsr(__i); -} +void _mm_setcsr(unsigned int); + +#if defined(__cplusplus) +} // extern "C" +#endif /// \brief Selects 4 float values from the 128-bit operands of [4 x float], as /// specified by the immediate value operand. @@ -2463,7 +2532,7 @@ _mm_setcsr(unsigned int __i) /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask); /// \endcode /// -/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction. +/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction. /// /// \param a /// A 128-bit vector of [4 x float]. @@ -2471,18 +2540,23 @@ _mm_setcsr(unsigned int __i) /// A 128-bit vector of [4 x float]. /// \param mask /// An immediate value containing an 8-bit value specifying which elements to -/// copy from a and b. -/// Bits [3:0] specify the values copied from operand a. -/// Bits [7:4] specify the values copied from operand b. The destinations -/// within the 128-bit destination are assigned values as follows: -/// Bits [1:0] are used to assign values to bits [31:0] in the destination. -/// Bits [3:2] are used to assign values to bits [63:32] in the destination. -/// Bits [5:4] are used to assign values to bits [95:64] in the destination. -/// Bits [7:6] are used to assign values to bits [127:96] in the destination. -/// Bit value assignments: -/// 00: Bits [31:0] copied from the specified operand. -/// 01: Bits [63:32] copied from the specified operand. -/// 10: Bits [95:64] copied from the specified operand. +/// copy from \ a and \a b. \n +/// Bits [3:0] specify the values copied from operand \a a. \n +/// Bits [7:4] specify the values copied from operand \a b. \n +/// The destinations within the 128-bit destination are assigned values as +/// follows: \n +/// Bits [1:0] are used to assign values to bits [31:0] in the +/// destination. 
\n +/// Bits [3:2] are used to assign values to bits [63:32] in the +/// destination. \n +/// Bits [5:4] are used to assign values to bits [95:64] in the +/// destination. \n +/// Bits [7:6] are used to assign values to bits [127:96] in the +/// destination. \n +/// Bit value assignments: \n +/// 00: Bits [31:0] copied from the specified operand. \n +/// 01: Bits [63:32] copied from the specified operand. \n +/// 10: Bits [95:64] copied from the specified operand. \n /// 11: Bits [127:96] copied from the specified operand. /// \returns A 128-bit vector of [4 x float] containing the shuffled values. #define _mm_shuffle_ps(a, b, mask) __extension__ ({ \ @@ -2493,20 +2567,19 @@ _mm_setcsr(unsigned int __i) 4 + (((mask) >> 6) & 0x3)); }) /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of -/// [4 x float] and interleaves them into a 128-bit vector of [4 x -/// float]. +/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction. +/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction. /// /// \param __a -/// A 128-bit vector of [4 x float]. -/// Bits [95:64] are written to bits [31:0] of the destination. +/// A 128-bit vector of [4 x float]. \n +/// Bits [95:64] are written to bits [31:0] of the destination. \n /// Bits [127:96] are written to bits [95:64] of the destination. /// \param __b /// A 128-bit vector of [4 x float]. -/// Bits [95:64] are written to bits [63:32] of the destination. +/// Bits [95:64] are written to bits [63:32] of the destination. \n /// Bits [127:96] are written to bits [127:96] of the destination. /// \returns A 128-bit vector of [4 x float] containing the interleaved values. static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -2516,20 +2589,19 @@ _mm_unpackhi_ps(__m128 __a, __m128 __b) } /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of -/// [4 x float] and interleaves them into a 128-bit vector of [4 x -/// float]. +/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction. +/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction. /// /// \param __a -/// A 128-bit vector of [4 x float]. -/// Bits [31:0] are written to bits [31:0] of the destination. +/// A 128-bit vector of [4 x float]. \n +/// Bits [31:0] are written to bits [31:0] of the destination. \n /// Bits [63:32] are written to bits [95:64] of the destination. /// \param __b -/// A 128-bit vector of [4 x float]. -/// Bits [31:0] are written to bits [63:32] of the destination. +/// A 128-bit vector of [4 x float]. \n +/// Bits [31:0] are written to bits [63:32] of the destination. \n /// Bits [63:32] are written to bits [127:96] of the destination. /// \returns A 128-bit vector of [4 x float] containing the interleaved values. static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -2544,7 +2616,7 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction. +/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. /// /// \param __a /// A 128-bit floating-point vector of [4 x float]. 
The upper 96 bits are @@ -2565,7 +2637,7 @@ _mm_move_ss(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction. +/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. /// /// \param __a /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are @@ -2586,7 +2658,7 @@ _mm_movehl_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction. +/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. /// /// \param __a /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are @@ -2606,7 +2678,8 @@ _mm_movelh_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c> +/// instruction. /// /// \param __a /// A 64-bit vector of [4 x i16]. The elements of the destination are copied @@ -2636,7 +2709,8 @@ _mm_cvtpi16_ps(__m64 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c> +/// instruction. /// /// \param __a /// A 64-bit vector of 16-bit unsigned integer values. The elements of the @@ -2665,7 +2739,8 @@ _mm_cvtpu16_ps(__m64 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c> +/// instruction. /// /// \param __a /// A 64-bit vector of [8 x i8]. The elements of the destination are copied @@ -2689,7 +2764,8 @@ _mm_cvtpi8_ps(__m64 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c> +/// instruction. /// /// \param __a /// A 64-bit vector of unsigned 8-bit integer values. The elements of the @@ -2713,7 +2789,8 @@ _mm_cvtpu8_ps(__m64 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c> +/// instruction. /// /// \param __a /// A 64-bit vector of [2 x i32]. The lower elements of the destination are @@ -2741,12 +2818,13 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) /// packs the results into a 64-bit integer vector of [4 x i16]. If the /// floating-point element is NaN or infinity, or if the floating-point /// element is greater than 0x7FFFFFFF or less than -0x8000, it is converted -/// to 0x8000. Otherwise if the floating-point element is greater -/// than 0x7FFF, it is converted to 0x7FFF. +/// to 0x8000. Otherwise if the floating-point element is greater than +/// 0x7FFF, it is converted to 0x7FFF. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction. +/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c> +/// instruction. /// /// \param __a /// A 128-bit floating-point vector of [4 x float]. @@ -2770,12 +2848,13 @@ _mm_cvtps_pi16(__m128 __a) /// [8 x i8]. The upper 32 bits of the vector are set to 0. If the /// floating-point element is NaN or infinity, or if the floating-point /// element is greater than 0x7FFFFFFF or less than -0x80, it is converted -/// to 0x80. 
Otherwise if the floating-point element is greater -/// than 0x7F, it is converted to 0x7F. +/// to 0x80. Otherwise if the floating-point element is greater than 0x7F, +/// it is converted to 0x7F. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction. +/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c> +/// instruction. /// /// \param __a /// 128-bit floating-point vector of [4 x float]. @@ -2799,7 +2878,7 @@ _mm_cvtps_pi8(__m128 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction. +/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction. /// /// \param __a /// A 128-bit floating-point vector of [4 x float].
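A brief usage sketch of the MXCSR control documented above (_mm_getcsr(), _mm_setcsr(), and the _MM_GET_*/_MM_SET_* convenience wrappers). This example is not part of the patch and assumes an SSE-enabled x86 target:

#include <stdio.h>
#include <xmmintrin.h>

int main(void)
{
    unsigned int saved = _mm_getcsr();         /* save the control/status word */

    /* Make subsequent SSE operations round toward +infinity. */
    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
    __m128 third = _mm_div_ss(_mm_set_ss(1.0f), _mm_set_ss(3.0f));
    printf("1/3 rounded up: %.9f\n", _mm_cvtss_f32(third));

    /* The sticky exception flags can be queried the same way. */
    if (_MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW)
        printf("an overflow exception has occurred\n");

    _mm_setcsr(saved);                         /* restore the original MXCSR */
    return 0;
}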