path: root/lib/Headers
author     Dimitry Andric <dim@FreeBSD.org>   2017-01-02 19:18:08 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2017-01-02 19:18:08 +0000
commit     bab175ec4b075c8076ba14c762900392533f6ee4 (patch)
tree       01f4f29419a2cb10abe13c1e63cd2a66068b0137 /lib/Headers
parent     8b7a8012d223fac5d17d16a66bb39168a9a1dfc0 (diff)
Diffstat (limited to 'lib/Headers')
-rw-r--r--  lib/Headers/CMakeLists.txt | 18
-rw-r--r--  lib/Headers/__clang_cuda_builtin_vars.h (renamed from lib/Headers/cuda_builtin_vars.h) | 0
-rw-r--r--  lib/Headers/__clang_cuda_cmath.h | 341
-rw-r--r--  lib/Headers/__clang_cuda_complex_builtins.h | 203
-rw-r--r--  lib/Headers/__clang_cuda_math_forward_declares.h | 25
-rw-r--r--  lib/Headers/__clang_cuda_runtime_wrapper.h | 46
-rw-r--r--  lib/Headers/__wmmintrin_aes.h | 12
-rw-r--r--  lib/Headers/__wmmintrin_pclmul.h | 15
-rw-r--r--  lib/Headers/altivec.h | 2274
-rw-r--r--  lib/Headers/ammintrin.h | 77
-rw-r--r--  lib/Headers/armintr.h | 45
-rw-r--r--  lib/Headers/avx512bwintrin.h | 428
-rw-r--r--  lib/Headers/avx512dqintrin.h | 437
-rw-r--r--  lib/Headers/avx512fintrin.h | 2346
-rw-r--r--  lib/Headers/avx512vlbwintrin.h | 1822
-rw-r--r--  lib/Headers/avx512vldqintrin.h | 439
-rw-r--r--  lib/Headers/avx512vlintrin.h | 2470
-rw-r--r--  lib/Headers/avxintrin.h | 3096
-rw-r--r--  lib/Headers/bmiintrin.h | 64
-rw-r--r--  lib/Headers/cuda_wrappers/algorithm | 96
-rw-r--r--  lib/Headers/cuda_wrappers/complex | 82
-rw-r--r--  lib/Headers/cuda_wrappers/new | 47
-rw-r--r--  lib/Headers/emmintrin.h | 2546
-rw-r--r--  lib/Headers/f16cintrin.h | 28
-rw-r--r--  lib/Headers/float.h | 7
-rw-r--r--  lib/Headers/fxsrintrin.h | 62
-rw-r--r--  lib/Headers/ia32intrin.h | 6
-rw-r--r--  lib/Headers/immintrin.h | 35
-rw-r--r--  lib/Headers/intrin.h | 682
-rw-r--r--  lib/Headers/lzcntintrin.h | 50
-rw-r--r--  lib/Headers/mmintrin.h | 174
-rw-r--r--  lib/Headers/module.modulemap | 2
-rw-r--r--  lib/Headers/opencl-c.h | 655
-rw-r--r--  lib/Headers/pmmintrin.h | 53
-rw-r--r--  lib/Headers/popcntintrin.h | 8
-rw-r--r--  lib/Headers/stdatomic.h | 10
-rw-r--r--  lib/Headers/tmmintrin.h | 42
-rw-r--r--  lib/Headers/xmmintrin.h | 579
38 files changed, 13669 insertions, 5653 deletions
diff --git a/lib/Headers/CMakeLists.txt b/lib/Headers/CMakeLists.txt
index fa2d2107781b..efc4dd0971b6 100644
--- a/lib/Headers/CMakeLists.txt
+++ b/lib/Headers/CMakeLists.txt
@@ -3,6 +3,7 @@ set(files
altivec.h
ammintrin.h
arm_acle.h
+ armintr.h
avx2intrin.h
avx512bwintrin.h
avx512cdintrin.h
@@ -21,12 +22,13 @@ set(files
avxintrin.h
bmi2intrin.h
bmiintrin.h
+ __clang_cuda_builtin_vars.h
__clang_cuda_cmath.h
+ __clang_cuda_complex_builtins.h
__clang_cuda_intrinsics.h
__clang_cuda_math_forward_declares.h
__clang_cuda_runtime_wrapper.h
cpuid.h
- cuda_builtin_vars.h
clflushoptintrin.h
emmintrin.h
f16cintrin.h
@@ -88,6 +90,12 @@ set(files
xtestintrin.h
)
+set(cuda_wrapper_files
+ cuda_wrappers/algorithm
+ cuda_wrappers/complex
+ cuda_wrappers/new
+)
+
set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include)
# Generate arm_neon.h
@@ -95,7 +103,7 @@ clang_tablegen(arm_neon.h -gen-arm-neon
SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td)
set(out_files)
-foreach( f ${files} )
+foreach( f ${files} ${cuda_wrapper_files} )
set( src ${CMAKE_CURRENT_SOURCE_DIR}/${f} )
set( dst ${output_dir}/${f} )
add_custom_command(OUTPUT ${dst}
@@ -120,6 +128,12 @@ install(
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
+install(
+ FILES ${cuda_wrapper_files}
+ COMPONENT clang-headers
+ PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+ DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include/cuda_wrappers)
+
if (NOT CMAKE_CONFIGURATION_TYPES) # don't add this for IDE's.
add_custom_target(install-clang-headers
DEPENDS clang-headers
diff --git a/lib/Headers/cuda_builtin_vars.h b/lib/Headers/__clang_cuda_builtin_vars.h
index 6f5eb9c78d85..6f5eb9c78d85 100644
--- a/lib/Headers/cuda_builtin_vars.h
+++ b/lib/Headers/__clang_cuda_builtin_vars.h
diff --git a/lib/Headers/__clang_cuda_cmath.h b/lib/Headers/__clang_cuda_cmath.h
index ae7ff2f8d306..0eaa08b30cab 100644
--- a/lib/Headers/__clang_cuda_cmath.h
+++ b/lib/Headers/__clang_cuda_cmath.h
@@ -26,13 +26,15 @@
#error "This file is for CUDA compilation only."
#endif
+#include <limits>
+
// CUDA lets us use various std math functions on the device side. This file
// works in concert with __clang_cuda_math_forward_declares.h to make this work.
//
// Specifically, the forward-declares header declares __device__ overloads for
// these functions in the global namespace, then pulls them into namespace std
// with 'using' statements. Then this file implements those functions, after
-// the implementations have been pulled in.
+// their implementations have been pulled in.
//
// It's important that we declare the functions in the global namespace and pull
// them into namespace std with using statements, as opposed to simply declaring
@@ -73,7 +75,10 @@ __DEVICE__ float frexp(float __arg, int *__exp) {
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
-__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); }
+// For inscrutable reasons, __finite(), the double-precision version of
+// __finitef, does not exist when compiling for MacOS. __isfinited is available
+// everywhere and is just as good.
+__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
__DEVICE__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
}
@@ -120,12 +125,15 @@ __DEVICE__ float ldexp(float __arg, int __exp) {
__DEVICE__ float log(float __x) { return ::logf(__x); }
__DEVICE__ float log10(float __x) { return ::log10f(__x); }
__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
-__DEVICE__ float nexttoward(float __from, float __to) {
+__DEVICE__ float nexttoward(float __from, double __to) {
return __builtin_nexttowardf(__from, __to);
}
__DEVICE__ double nexttoward(double __from, double __to) {
return __builtin_nexttoward(__from, __to);
}
+__DEVICE__ float nexttowardf(float __from, double __to) {
+ return __builtin_nexttowardf(__from, __to);
+}
__DEVICE__ float pow(float __base, float __exp) {
return ::powf(__base, __exp);
}
@@ -136,13 +144,338 @@ __DEVICE__ double pow(double __base, int __iexp) {
return ::powi(__base, __iexp);
}
__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
-__DEVICE__ bool signbit(double __x) { return ::__signbit(__x); }
+__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
__DEVICE__ float sin(float __x) { return ::sinf(__x); }
__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
__DEVICE__ float tan(float __x) { return ::tanf(__x); }
__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
+// Now we've defined everything we promised we'd define in
+// __clang_cuda_math_forward_declares.h. We need to do two additional things to
+// fix up our math functions.
+//
+// 1) Define __device__ overloads for e.g. sin(int). The CUDA headers define
+// only sin(float) and sin(double), which means that e.g. sin(0) is
+// ambiguous.
+//
+// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
+// std. These are defined in the CUDA headers in the global namespace,
+// independent of everything else we've done here.
+
+// We can't use std::enable_if, because we want to be pre-C++11 compatible. But
+// we go ahead and unconditionally define functions that are only available when
+// compiling for C++11 to match the behavior of the CUDA headers.
+template<bool __B, class __T = void>
+struct __clang_cuda_enable_if {};
+
+template <class __T> struct __clang_cuda_enable_if<true, __T> {
+ typedef __T type;
+};
+
+// Defines an overload of __fn that accepts one integral argument, calls
+// __fn((double)x), and returns __retty.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn) \
+ template <typename __T> \
+ __DEVICE__ \
+ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, \
+ __retty>::type \
+ __fn(__T __x) { \
+ return ::__fn((double)__x); \
+ }
+
+// Defines an overload of __fn that accepts two arithmetic arguments, calls
+// __fn((double)x, (double)y), and returns a double.
+//
+// Note this is different from OVERLOAD_1, which generates an overload that
+// accepts only *integral* arguments.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn) \
+ template <typename __T1, typename __T2> \
+ __DEVICE__ typename __clang_cuda_enable_if< \
+ std::numeric_limits<__T1>::is_specialized && \
+ std::numeric_limits<__T2>::is_specialized, \
+ __retty>::type \
+ __fn(__T1 __x, __T2 __y) { \
+ return __fn((double)__x, (double)__y); \
+ }
+
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc);
+
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
+
+// Overloads for functions that don't match the patterns expected by
+// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
+template <typename __T1, typename __T2, typename __T3>
+__DEVICE__ typename __clang_cuda_enable_if<
+ std::numeric_limits<__T1>::is_specialized &&
+ std::numeric_limits<__T2>::is_specialized &&
+ std::numeric_limits<__T3>::is_specialized,
+ double>::type
+fma(__T1 __x, __T2 __y, __T3 __z) {
+ return std::fma((double)__x, (double)__y, (double)__z);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+ double>::type
+frexp(__T __x, int *__exp) {
+ return std::frexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+ double>::type
+ldexp(__T __x, int __exp) {
+ return std::ldexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+ double>::type
+nexttoward(__T __from, double __to) {
+ return std::nexttoward((double)__from, __to);
+}
+
+template <typename __T1, typename __T2>
+__DEVICE__ typename __clang_cuda_enable_if<
+ std::numeric_limits<__T1>::is_specialized &&
+ std::numeric_limits<__T2>::is_specialized,
+ double>::type
+remquo(__T1 __x, __T2 __y, int *__quo) {
+ return std::remquo((double)__x, (double)__y, __quo);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+ double>::type
+scalbln(__T __x, long __exp) {
+ return std::scalbln((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+ double>::type
+scalbn(__T __x, int __exp) {
+ return std::scalbn((double)__x, __exp);
+}
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>). Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+// Pull the new overloads we defined above into namespace std.
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnormal;
+using ::isunordered;
+using ::ldexp;
+using ::lgamma;
+using ::llrint;
+using ::llround;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::nearbyint;
+using ::nextafter;
+using ::nexttoward;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+// Well this is fun: We need to pull these symbols in for libc++, but we can't
+// pull them in with libstdc++, because its ::isinf and ::isnan are different
+// than its std::isinf and std::isnan.
+#ifndef __GLIBCXX__
+using ::isinf;
+using ::isnan;
+#endif
+
+// Finally, pull the "foobarf" functions that CUDA defines in its headers into
+// namespace std.
+using ::acosf;
+using ::acoshf;
+using ::asinf;
+using ::asinhf;
+using ::atan2f;
+using ::atanf;
+using ::atanhf;
+using ::cbrtf;
+using ::ceilf;
+using ::copysignf;
+using ::cosf;
+using ::coshf;
+using ::erfcf;
+using ::erff;
+using ::exp2f;
+using ::expf;
+using ::expm1f;
+using ::fabsf;
+using ::fdimf;
+using ::floorf;
+using ::fmaf;
+using ::fmaxf;
+using ::fminf;
+using ::fmodf;
+using ::frexpf;
+using ::hypotf;
+using ::ilogbf;
+using ::ldexpf;
+using ::lgammaf;
+using ::llrintf;
+using ::llroundf;
+using ::log10f;
+using ::log1pf;
+using ::log2f;
+using ::logbf;
+using ::logf;
+using ::lrintf;
+using ::lroundf;
+using ::modff;
+using ::nearbyintf;
+using ::nextafterf;
+using ::nexttowardf;
+using ::powf;
+using ::remainderf;
+using ::remquof;
+using ::rintf;
+using ::roundf;
+using ::scalblnf;
+using ::scalbnf;
+using ::sinf;
+using ::sinhf;
+using ::sqrtf;
+using ::tanf;
+using ::tanhf;
+using ::tgammaf;
+using ::truncf;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
#undef __DEVICE__
#endif
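
Aside: a minimal host-only sketch (not part of the patch; f, enable_if_ and the other names below are placeholders) of the trick the integer-overload macros above rely on. With only f(float) and f(double) visible, a call such as f(0) is ambiguous; an enable_if-gated template that accepts any integral type and forwards the value as double resolves it, which is what __CUDA_CLANG_FN_INTEGER_OVERLOAD_1/2 generate for each math function before the using-declarations pull everything into namespace std.

#include <cstdio>
#include <limits>

template <bool B, class T = void> struct enable_if_ {};
template <class T> struct enable_if_<true, T> { typedef T type; };

float  f(float x)  { return x + 0.5f; }
double f(double x) { return x + 0.25; }

// Accepts any integral argument and forwards it as double -- mirrors the
// expansion of the OVERLOAD_1 macro for a single function.
template <typename T>
typename enable_if_<std::numeric_limits<T>::is_integer, double>::type
f(T x) { return f((double)x); }

int main() {
  std::printf("%f\n", f(0));    // ambiguous without the template overload
  std::printf("%f\n", f(1.0f)); // still picks f(float) exactly
}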
diff --git a/lib/Headers/__clang_cuda_complex_builtins.h b/lib/Headers/__clang_cuda_complex_builtins.h
new file mode 100644
index 000000000000..beef7deff87f
--- /dev/null
+++ b/lib/Headers/__clang_cuda_complex_builtins.h
@@ -0,0 +1,203 @@
+/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
+#define __CLANG_CUDA_COMPLEX_BUILTINS
+
+// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. These are
+// libgcc functions that clang assumes are available when compiling c99 complex
+// operations. (These implementations come from libc++, and have been modified
+// to work with CUDA.)
+
+extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
+ double __c, double __d) {
+ double __ac = __a * __c;
+ double __bd = __b * __d;
+ double __ad = __a * __d;
+ double __bc = __b * __c;
+ double _Complex z;
+ __real__(z) = __ac - __bd;
+ __imag__(z) = __ad + __bc;
+ if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+ int __recalc = 0;
+ if (std::isinf(__a) || std::isinf(__b)) {
+ __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+ __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+ if (std::isnan(__c))
+ __c = std::copysign(0, __c);
+ if (std::isnan(__d))
+ __d = std::copysign(0, __d);
+ __recalc = 1;
+ }
+ if (std::isinf(__c) || std::isinf(__d)) {
+ __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+ __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+ if (std::isnan(__a))
+ __a = std::copysign(0, __a);
+ if (std::isnan(__b))
+ __b = std::copysign(0, __b);
+ __recalc = 1;
+ }
+ if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+ std::isinf(__ad) || std::isinf(__bc))) {
+ if (std::isnan(__a))
+ __a = std::copysign(0, __a);
+ if (std::isnan(__b))
+ __b = std::copysign(0, __b);
+ if (std::isnan(__c))
+ __c = std::copysign(0, __c);
+ if (std::isnan(__d))
+ __d = std::copysign(0, __d);
+ __recalc = 1;
+ }
+ if (__recalc) {
+ // Can't use std::numeric_limits<double>::infinity() -- that doesn't have
+ // a device overload (and isn't constexpr before C++11, naturally).
+ __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+ __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+ }
+ }
+ return z;
+}
+
+extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
+ float __c, float __d) {
+ float __ac = __a * __c;
+ float __bd = __b * __d;
+ float __ad = __a * __d;
+ float __bc = __b * __c;
+ float _Complex z;
+ __real__(z) = __ac - __bd;
+ __imag__(z) = __ad + __bc;
+ if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+ int __recalc = 0;
+ if (std::isinf(__a) || std::isinf(__b)) {
+ __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+ __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+ if (std::isnan(__c))
+ __c = std::copysign(0, __c);
+ if (std::isnan(__d))
+ __d = std::copysign(0, __d);
+ __recalc = 1;
+ }
+ if (std::isinf(__c) || std::isinf(__d)) {
+ __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+ __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+ if (std::isnan(__a))
+ __a = std::copysign(0, __a);
+ if (std::isnan(__b))
+ __b = std::copysign(0, __b);
+ __recalc = 1;
+ }
+ if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+ std::isinf(__ad) || std::isinf(__bc))) {
+ if (std::isnan(__a))
+ __a = std::copysign(0, __a);
+ if (std::isnan(__b))
+ __b = std::copysign(0, __b);
+ if (std::isnan(__c))
+ __c = std::copysign(0, __c);
+ if (std::isnan(__d))
+ __d = std::copysign(0, __d);
+ __recalc = 1;
+ }
+ if (__recalc) {
+ __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+ __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+ }
+ }
+ return z;
+}
+
+extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
+ double __c, double __d) {
+ int __ilogbw = 0;
+ // Can't use std::max, because that's defined in <algorithm>, and we don't
+ // want to pull that in for every compile. The CUDA headers define
+ // ::max(float, float) and ::max(double, double), which is sufficient for us.
+ double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+ if (std::isfinite(__logbw)) {
+ __ilogbw = (int)__logbw;
+ __c = std::scalbn(__c, -__ilogbw);
+ __d = std::scalbn(__d, -__ilogbw);
+ }
+ double __denom = __c * __c + __d * __d;
+ double _Complex z;
+ __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+ __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+ if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+ if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
+ __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+ __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+ } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+ std::isfinite(__d)) {
+ __a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
+ __b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
+ __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+ __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+ } else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
+ std::isfinite(__b)) {
+ __c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
+ __d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
+ __real__(z) = 0.0 * (__a * __c + __b * __d);
+ __imag__(z) = 0.0 * (__b * __c - __a * __d);
+ }
+ }
+ return z;
+}
+
+extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
+ float __c, float __d) {
+ int __ilogbw = 0;
+ float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+ if (std::isfinite(__logbw)) {
+ __ilogbw = (int)__logbw;
+ __c = std::scalbn(__c, -__ilogbw);
+ __d = std::scalbn(__d, -__ilogbw);
+ }
+ float __denom = __c * __c + __d * __d;
+ float _Complex z;
+ __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+ __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+ if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+ if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
+ __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+ __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+ } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+ std::isfinite(__d)) {
+ __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+ __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+ __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+ __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+ } else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
+ std::isfinite(__b)) {
+ __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+ __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+ __real__(z) = 0 * (__a * __c + __b * __d);
+ __imag__(z) = 0 * (__b * __c - __a * __d);
+ }
+ }
+ return z;
+}
+
+#endif // __CLANG_CUDA_COMPLEX_BUILTINS
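
Aside: an illustrative host-side sketch (not part of the patch; cmul and cdiv are placeholder helpers) of where these entry points come in. For C99 _Complex multiplication and division, clang typically emits calls to __mulsc3/__muldc3 and __divsc3/__divdc3 (at least on the inf/NaN fix-up path) so that Annex G semantics hold; on the host these live in libgcc or compiler-rt, and the definitions above provide device-side equivalents so the same lowering works under CUDA.

#include <cstdio>

static float _Complex cmul(float _Complex a, float _Complex b) {
  return a * b;   // may fall back on __mulsc3 to fix up inf/NaN operands
}

static float _Complex cdiv(float _Complex a, float _Complex b) {
  return a / b;   // typically lowered to a __divsc3 call
}

int main() {
  float _Complex a, b;
  __real__ a = 1.0f; __imag__ a = 2.0f;   // a = 1 + 2i
  __real__ b = 3.0f; __imag__ b = -1.0f;  // b = 3 - 1i
  float _Complex p = cmul(a, b);
  float _Complex q = cdiv(a, b);
  std::printf("p = %f%+fi\n", (double)__real__ p, (double)__imag__ p);
  std::printf("q = %f%+fi\n", (double)__real__ q, (double)__imag__ q);
}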
diff --git a/lib/Headers/__clang_cuda_math_forward_declares.h b/lib/Headers/__clang_cuda_math_forward_declares.h
index 3f2834d95000..49c805151d65 100644
--- a/lib/Headers/__clang_cuda_math_forward_declares.h
+++ b/lib/Headers/__clang_cuda_math_forward_declares.h
@@ -140,6 +140,7 @@ __DEVICE__ long lrint(double);
__DEVICE__ long lrint(float);
__DEVICE__ long lround(double);
__DEVICE__ long lround(float);
+__DEVICE__ long long llround(float); // No llround(double).
__DEVICE__ double modf(double, double *);
__DEVICE__ float modf(float, float *);
__DEVICE__ double nan(const char *);
@@ -149,7 +150,8 @@ __DEVICE__ float nearbyint(float);
__DEVICE__ double nextafter(double, double);
__DEVICE__ float nextafter(float, float);
__DEVICE__ double nexttoward(double, double);
-__DEVICE__ float nexttoward(float, float);
+__DEVICE__ float nexttoward(float, double);
+__DEVICE__ float nexttowardf(float, double);
__DEVICE__ double pow(double, double);
__DEVICE__ double pow(double, int);
__DEVICE__ float pow(float, float);
@@ -183,7 +185,19 @@ __DEVICE__ float tgamma(float);
__DEVICE__ double trunc(double);
__DEVICE__ float trunc(float);
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>). Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
using ::abs;
using ::acos;
using ::acosh;
@@ -235,6 +249,7 @@ using ::log2;
using ::logb;
using ::lrint;
using ::lround;
+using ::llround;
using ::modf;
using ::nan;
using ::nanf;
@@ -256,7 +271,15 @@ using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
} // namespace std
+#endif
#pragma pop_macro("__DEVICE__")
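
Aside: a small self-contained sketch (not part of the patch; lib, v1 and twice are placeholder names) of why these declarations must land in the library's own inline namespace. Unqualified lookup from code inside an inline namespace stops as soon as it finds a declaration of the name there, so an overload added only to the plain enclosing namespace, which is what a bare "namespace std { ... }" block would give under libc++, is never considered by library-internal callers such as the functions in <complex>.

#include <cstdio>

// Overload added outside the inline namespace (stands in for a __device__
// overload declared in a plain "namespace std" block).
namespace lib { int twice(int x) { return 2 * x + 1; } }

namespace lib {
inline namespace v1 {                // stands in for libc++'s std::__1
double twice(double x) { return 2 * x; }

// "Library-internal" caller: unqualified lookup finds twice(double) in v1 and
// stops, so lib::twice(int) above is never even considered.
int call_twice(int x) { return (int)twice(x); }
}
}

int main() {
  std::printf("%d\n", lib::twice(3));          // 7: users see both overloads
  std::printf("%d\n", lib::v1::call_twice(3)); // 6: library code does not
}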
diff --git a/lib/Headers/__clang_cuda_runtime_wrapper.h b/lib/Headers/__clang_cuda_runtime_wrapper.h
index 6445f9b76b8f..205e15b40b5d 100644
--- a/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -62,7 +62,7 @@
#include "cuda.h"
#if !defined(CUDA_VERSION)
#error "cuda.h did not define CUDA_VERSION"
-#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000
#error "Unsupported CUDA version!"
#endif
@@ -72,9 +72,9 @@
#define __CUDA_ARCH__ 350
#endif
-#include "cuda_builtin_vars.h"
+#include "__clang_cuda_builtin_vars.h"
-// No need for device_launch_parameters.h as cuda_builtin_vars.h above
+// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
// has taken care of builtin variables declared in the file.
#define __DEVICE_LAUNCH_PARAMETERS_H__
@@ -113,6 +113,7 @@
#undef __cxa_vec_ctor
#undef __cxa_vec_cctor
#undef __cxa_vec_dtor
+#undef __cxa_vec_new
#undef __cxa_vec_new2
#undef __cxa_vec_new3
#undef __cxa_vec_delete2
@@ -120,6 +121,15 @@
#undef __cxa_vec_delete3
#undef __cxa_pure_virtual
+// math_functions.hpp expects this host function to be defined on MacOS, but it
+// ends up not being there because of the games we play here. Just define it
+// ourselves; it's simple enough.
+#ifdef __APPLE__
+inline __host__ double __signbitd(double x) {
+ return std::signbit(x);
+}
+#endif
+
// We need decls for functions in CUDA's libdevice with __device__
// attribute only. Alas they come either as __host__ __device__ or
// with no attributes at all. To work around that, define __CUDA_RTC__
@@ -135,6 +145,21 @@
// the headers we're about to include.
#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
+// Previous versions used to check whether they are defined or not.
+// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
+// here to detect the switch.
+
+#if defined(CU_DEVICE_INVALID)
+#if !defined(__USE_FAST_MATH__)
+#define __USE_FAST_MATH__ 0
+#endif
+
+#if !defined(__CUDA_PREC_DIV)
+#define __CUDA_PREC_DIV 0
+#endif
+#endif
+
// device_functions.hpp and math_functions*.hpp use 'static
// __forceinline__' (with no __device__) for definitions of device
// functions. Temporarily redefine __forceinline__ to include
@@ -151,7 +176,7 @@
// slow divides), so we need to scope our define carefully here.
#pragma push_macro("__USE_FAST_MATH__")
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
-#define __USE_FAST_MATH__
+#define __USE_FAST_MATH__ 1
#endif
#include "math_functions.hpp"
#pragma pop_macro("__USE_FAST_MATH__")
@@ -267,8 +292,8 @@ __device__ static inline void *malloc(size_t __size) {
}
} // namespace std
-// Out-of-line implementations from cuda_builtin_vars.h. These need to come
-// after we've pulled in the definition of uint3 and dim3.
+// Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to
+// come after we've pulled in the definition of uint3 and dim3.
__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
uint3 ret;
@@ -296,13 +321,14 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
#include <__clang_cuda_cmath.h>
#include <__clang_cuda_intrinsics.h>
+#include <__clang_cuda_complex_builtins.h>
// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
// mode, giving them their "proper" types of dim3 and uint3. This is
-// incompatible with the types we give in cuda_builtin_vars.h. As as hack,
-// force-include the header (nvcc doesn't include it by default) but redefine
-// dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are only
-// used here for the redeclarations of blockDim and threadIdx.)
+// incompatible with the types we give in __clang_cuda_builtin_vars.h. As a
+// hack, force-include the header (nvcc doesn't include it by default) but
+// redefine dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are
+// only used here for the redeclarations of blockDim and threadIdx.)
#pragma push_macro("dim3")
#pragma push_macro("uint3")
#define dim3 __cuda_builtin_blockDim_t
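
Aside: a minimal sketch (not part of the patch; WIDGET_MODE is a placeholder macro) of the push_macro/pop_macro pattern the wrapper uses above to scope its temporary redefinitions of __USE_FAST_MATH__, dim3 and uint3: the pragma saves the current definition, the code overrides the macro for just the region that needs it, and pop_macro restores whatever was there before.

#include <cstdio>

#define WIDGET_MODE 0

void before() { std::printf("before: %d\n", WIDGET_MODE); }

#pragma push_macro("WIDGET_MODE")   // save the current definition
#undef WIDGET_MODE
#define WIDGET_MODE 1               // override only for this region
void during() { std::printf("during: %d\n", WIDGET_MODE); }
#pragma pop_macro("WIDGET_MODE")    // restore the saved definition

void after() { std::printf("after: %d\n", WIDGET_MODE); }

int main() { before(); during(); after(); }  // prints 0, 1, 0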
diff --git a/lib/Headers/__wmmintrin_aes.h b/lib/Headers/__wmmintrin_aes.h
index 211518eb2884..3a2ee1b2ef2e 100644
--- a/lib/Headers/__wmmintrin_aes.h
+++ b/lib/Headers/__wmmintrin_aes.h
@@ -35,7 +35,7 @@
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VAESENC instruction.
+/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
@@ -55,7 +55,7 @@ _mm_aesenc_si128(__m128i __V, __m128i __R)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VAESENCLAST instruction.
+/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
@@ -75,7 +75,7 @@ _mm_aesenclast_si128(__m128i __V, __m128i __R)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VAESDEC instruction.
+/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
@@ -95,7 +95,7 @@ _mm_aesdec_si128(__m128i __V, __m128i __R)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VAESDECLAST instruction.
+/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
@@ -114,7 +114,7 @@ _mm_aesdeclast_si128(__m128i __V, __m128i __R)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VAESIMC instruction.
+/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the expanded key.
@@ -136,7 +136,7 @@ _mm_aesimc_si128(__m128i __V)
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
/// \endcode
///
-/// This intrinsic corresponds to the \c AESKEYGENASSIST instruction.
+/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
///
/// \param C
/// A 128-bit integer vector that is used to generate the AES encryption key.
diff --git a/lib/Headers/__wmmintrin_pclmul.h b/lib/Headers/__wmmintrin_pclmul.h
index d4e073f40688..e9c6a9f6d415 100644
--- a/lib/Headers/__wmmintrin_pclmul.h
+++ b/lib/Headers/__wmmintrin_pclmul.h
@@ -34,7 +34,7 @@
/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPCLMULQDQ instruction.
+/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
///
/// \param __X
/// A 128-bit vector of [2 x i64] containing one of the source operands.
@@ -42,13 +42,12 @@
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param __I
/// An immediate value specifying which 64-bit values to select from the
-/// operands.
-/// Bit 0 is used to select a value from operand __X,
-/// and bit 4 is used to select a value from operand __Y:
-/// Bit[0]=0 indicates that bits[63:0] of operand __X are used.
-/// Bit[0]=1 indicates that bits[127:64] of operand __X are used.
-/// Bit[4]=0 indicates that bits[63:0] of operand __Y are used.
-/// Bit[4]=1 indicates that bits[127:64] of operand __Y are used.
+/// operands. Bit 0 is used to select a value from operand \a __X, and bit
+/// 4 is used to select a value from operand \a __Y: \n
+/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
+/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
+/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
+/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
/// multiplication of the selected 64-bit values.
#define _mm_clmulepi64_si128(__X, __Y, __I) \
diff --git a/lib/Headers/altivec.h b/lib/Headers/altivec.h
index 74a1914ce83b..d1d1d8026325 100644
--- a/lib/Headers/altivec.h
+++ b/lib/Headers/altivec.h
@@ -34,8 +34,31 @@
#define __CR6_LT 2
#define __CR6_LT_REV 3
+/* Constants for vec_test_data_class */
+#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
+#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
+#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | \
+ __VEC_CLASS_FP_SUBNORMAL_N)
+#define __VEC_CLASS_FP_ZERO_N (1<<2)
+#define __VEC_CLASS_FP_ZERO_P (1<<3)
+#define __VEC_CLASS_FP_ZERO (__VEC_CLASS_FP_ZERO_P | \
+ __VEC_CLASS_FP_ZERO_N)
+#define __VEC_CLASS_FP_INFINITY_N (1<<4)
+#define __VEC_CLASS_FP_INFINITY_P (1<<5)
+#define __VEC_CLASS_FP_INFINITY (__VEC_CLASS_FP_INFINITY_P | \
+ __VEC_CLASS_FP_INFINITY_N)
+#define __VEC_CLASS_FP_NAN (1<<6)
+#define __VEC_CLASS_FP_NOT_NORMAL (__VEC_CLASS_FP_NAN | \
+ __VEC_CLASS_FP_SUBNORMAL | \
+ __VEC_CLASS_FP_ZERO | \
+ __VEC_CLASS_FP_INFINITY)
+
#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+#ifdef __POWER9_VECTOR__
+#include <stddef.h>
+#endif
+
static __inline__ vector signed char __ATTRS_o_ai vec_perm(
vector signed char __a, vector signed char __b, vector unsigned char __c);
@@ -134,7 +157,7 @@ static __inline__ vector float __ATTRS_o_ai vec_abs(vector float __a) {
#endif
}
-#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+#ifdef __VSX__
static __inline__ vector double __ATTRS_o_ai vec_abs(vector double __a) {
return __builtin_vsx_xvabsdp(__a);
}
@@ -163,6 +186,26 @@ vec_abss(vector signed int __a) {
__a, __builtin_altivec_vsubsws((vector signed int)(0), __a));
}
+/* vec_absd */
+#if defined(__POWER9_VECTOR__)
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_absd(vector unsigned char __a, vector unsigned char __b) {
+ return __builtin_altivec_vabsdub(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_absd(vector unsigned short __a, vector unsigned short __b) {
+ return __builtin_altivec_vabsduh(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_absd(vector unsigned int __a, vector unsigned int __b) {
+ return __builtin_altivec_vabsduw(__a, __b);
+}
+
+#endif /* End __POWER9_VECTOR__ */
+
/* vec_add */
static __inline__ vector signed char __ATTRS_o_ai
@@ -305,6 +348,22 @@ vec_adde(vector unsigned __int128 __a, vector unsigned __int128 __b,
}
#endif
+static __inline__ vector signed int __ATTRS_o_ai
+vec_adde(vector signed int __a, vector signed int __b,
+ vector signed int __c) {
+ vector signed int __mask = {1, 1, 1, 1};
+ vector signed int __carry = __c & __mask;
+ return vec_add(vec_add(__a, __b), __carry);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adde(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+ vector unsigned int __mask = {1, 1, 1, 1};
+ vector unsigned int __carry = __c & __mask;
+ return vec_add(vec_add(__a, __b), __carry);
+}
+
/* vec_addec */
#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
@@ -319,6 +378,50 @@ vec_addec(vector unsigned __int128 __a, vector unsigned __int128 __b,
vector unsigned __int128 __c) {
return __builtin_altivec_vaddecuq(__a, __b, __c);
}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_addec(vector signed int __a, vector signed int __b,
+ vector signed int __c) {
+
+ signed int __result[4];
+ for (int i = 0; i < 4; i++) {
+ unsigned int __tempa = (unsigned int) __a[i];
+ unsigned int __tempb = (unsigned int) __b[i];
+ unsigned int __tempc = (unsigned int) __c[i];
+ __tempc = __tempc & 0x00000001;
+ unsigned long long __longa = (unsigned long long) __tempa;
+ unsigned long long __longb = (unsigned long long) __tempb;
+ unsigned long long __longc = (unsigned long long) __tempc;
+ unsigned long long __sum = __longa + __longb + __longc;
+ unsigned long long __res = (__sum >> 32) & 0x01;
+ unsigned long long __tempres = (unsigned int) __res;
+ __result[i] = (signed int) __tempres;
+ }
+
+ vector signed int ret = { __result[0], __result[1], __result[2], __result[3] };
+ return ret;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_addec(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+
+ unsigned int __result[4];
+ for (int i = 0; i < 4; i++) {
+ unsigned int __tempc = __c[i] & 1;
+ unsigned long long __longa = (unsigned long long) __a[i];
+ unsigned long long __longb = (unsigned long long) __b[i];
+ unsigned long long __longc = (unsigned long long) __tempc;
+ unsigned long long __sum = __longa + __longb + __longc;
+ unsigned long long __res = (__sum >> 32) & 0x01;
+ unsigned long long __tempres = (unsigned int) __res;
+ __result[i] = (signed int) __tempres;
+ }
+
+ vector unsigned int ret = { __result[0], __result[1], __result[2], __result[3] };
+ return ret;
+}
+
#endif
/* vec_vaddubm */
@@ -1544,6 +1647,12 @@ vec_cmpeq(vector unsigned char __a, vector unsigned char __b) {
(vector char)__b);
}
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpeq(vector bool char __a, vector bool char __b) {
+ return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+ (vector char)__b);
+}
+
static __inline__ vector bool short __ATTRS_o_ai vec_cmpeq(vector short __a,
vector short __b) {
return (vector bool short)__builtin_altivec_vcmpequh(__a, __b);
@@ -1555,6 +1664,12 @@ vec_cmpeq(vector unsigned short __a, vector unsigned short __b) {
(vector short)__b);
}
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpeq(vector bool short __a, vector bool short __b) {
+ return (vector bool short)__builtin_altivec_vcmpequh((vector short)__a,
+ (vector short)__b);
+}
+
static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector int __a,
vector int __b) {
return (vector bool int)__builtin_altivec_vcmpequw(__a, __b);
@@ -1566,6 +1681,12 @@ vec_cmpeq(vector unsigned int __a, vector unsigned int __b) {
(vector int)__b);
}
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector bool int __a,
+ vector bool int __b) {
+ return (vector bool int)__builtin_altivec_vcmpequw((vector int)__a,
+ (vector int)__b);
+}
+
#ifdef __POWER8_VECTOR__
static __inline__ vector bool long long __ATTRS_o_ai
vec_cmpeq(vector signed long long __a, vector signed long long __b) {
@@ -1577,6 +1698,13 @@ vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
return (vector bool long long)__builtin_altivec_vcmpequd(
(vector long long)__a, (vector long long)__b);
}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector bool long long __a, vector bool long long __b) {
+ return (vector bool long long)__builtin_altivec_vcmpequd(
+ (vector long long)__a, (vector long long)__b);
+}
+
#endif
static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector float __a,
@@ -1595,6 +1723,199 @@ vec_cmpeq(vector double __a, vector double __b) {
}
#endif
+#ifdef __POWER9_VECTOR__
+/* vec_cmpne */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector bool char __a, vector bool char __b) {
+ return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+ (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector signed char __a, vector signed char __b) {
+ return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+ (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector unsigned char __a, vector unsigned char __b) {
+ return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+ (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector bool short __a, vector bool short __b) {
+ return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+ (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector signed short __a, vector signed short __b) {
+ return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+ (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector unsigned short __a, vector unsigned short __b) {
+ return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+ (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector bool int __a, vector bool int __b) {
+ return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector signed int __a, vector signed int __b) {
+ return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector unsigned int __a, vector unsigned int __b) {
+ return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector bool long long __a, vector bool long long __b) {
+ return (vector bool long long)
+ ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector signed long long __a, vector signed long long __b) {
+ return (vector bool long long)
+ ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) {
+ return (vector bool long long)
+ ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector float __a, vector float __b) {
+ return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector double __a, vector double __b) {
+ return (vector bool long long)
+ ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+/* vec_cmpnez */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpnez(vector signed char __a, vector signed char __b) {
+ return (vector bool char)__builtin_altivec_vcmpnezb((vector char)__a,
+ (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpnez(vector unsigned char __a, vector unsigned char __b) {
+ return (vector bool char)__builtin_altivec_vcmpnezb((vector char)__a,
+ (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpnez(vector signed short __a, vector signed short __b) {
+ return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a,
+ (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpnez(vector unsigned short __a, vector unsigned short __b) {
+ return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a,
+ (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpnez(vector signed int __a, vector signed int __b) {
+ return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpnez(vector unsigned int __a, vector unsigned int __b) {
+ return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cntlz_lsbb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vctzlsbb(__a);
+#else
+ return __builtin_altivec_vclzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cntlz_lsbb(vector unsigned char __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vctzlsbb(__a);
+#else
+ return __builtin_altivec_vclzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cnttz_lsbb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vclzlsbb(__a);
+#else
+ return __builtin_altivec_vctzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cnttz_lsbb(vector unsigned char __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_altivec_vclzlsbb(__a);
+#else
+ return __builtin_altivec_vctzlsbb(__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned int __a) {
+ return __builtin_altivec_vprtybw(__a);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_parity_lsbb(vector signed int __a) {
+ return __builtin_altivec_vprtybw(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned __int128 __a) {
+ return __builtin_altivec_vprtybq(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_parity_lsbb(vector signed __int128 __a) {
+ return __builtin_altivec_vprtybq(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned long long __a) {
+ return __builtin_altivec_vprtybd(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_parity_lsbb(vector signed long long __a) {
+ return __builtin_altivec_vprtybd(__a);
+}
+
+#endif
+
/* vec_cmpgt */
static __inline__ vector bool char __ATTRS_o_ai
@@ -1882,6 +2203,41 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
return vec_cmpgt(__b, __a);
}
+/* vec_popcnt */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_popcnt(vector signed char __a) {
+ return __builtin_altivec_vpopcntb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_popcnt(vector unsigned char __a) {
+ return __builtin_altivec_vpopcntb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_popcnt(vector signed short __a) {
+ return __builtin_altivec_vpopcnth(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_popcnt(vector unsigned short __a) {
+ return __builtin_altivec_vpopcnth(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_popcnt(vector signed int __a) {
+ return __builtin_altivec_vpopcntw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_popcnt(vector unsigned int __a) {
+ return __builtin_altivec_vpopcntw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_popcnt(vector signed long long __a) {
+ return __builtin_altivec_vpopcntd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_popcnt(vector unsigned long long __a) {
+ return __builtin_altivec_vpopcntd(__a);
+}
+
/* vec_cntlz */
static __inline__ vector signed char __ATTRS_o_ai
@@ -1918,6 +2274,603 @@ vec_cntlz(vector unsigned long long __a) {
}
#endif
+#ifdef __POWER9_VECTOR__
+
+/* vec_cnttz */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_cnttz(vector signed char __a) {
+ return __builtin_altivec_vctzb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_cnttz(vector unsigned char __a) {
+ return __builtin_altivec_vctzb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_cnttz(vector signed short __a) {
+ return __builtin_altivec_vctzh(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_cnttz(vector unsigned short __a) {
+ return __builtin_altivec_vctzh(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_cnttz(vector signed int __a) {
+ return __builtin_altivec_vctzw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_cnttz(vector unsigned int __a) {
+ return __builtin_altivec_vctzw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_cnttz(vector signed long long __a) {
+ return __builtin_altivec_vctzd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_cnttz(vector unsigned long long __a) {
+ return __builtin_altivec_vctzd(__a);
+}
+
+/* vec_first_match_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed char __a, vector signed char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned char __a, vector unsigned char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed short __a, vector signed short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned short __a, vector unsigned short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed int __a, vector signed int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned int __a, vector unsigned int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+/* vec_first_match_or_eos_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed char __a, vector signed char __b) {
+  /* OR the equality comparison of the two vectors with comparisons of each
+     input against that equality result: an element position is flagged when
+     the two elements are equal or when either element is zero (end of
+     string). */
+ vector bool char __tmp1 = vec_cmpeq(__a, __b);
+ vector bool char __tmp2 = __tmp1 |
+ vec_cmpeq((vector signed char)__tmp1, __a) |
+ vec_cmpeq((vector signed char)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
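/* Aside (illustration only, not part of the patch; the helper name below is
   hypothetical): a scalar equivalent of what the vector sequence above
   computes. Position i "matches" when the two elements are equal there or
   when either element is zero, and the result is the first such position,
   or 16 when there is none. */
static inline unsigned
first_match_or_eos_scalar(const signed char *a, const signed char *b) {
  for (unsigned i = 0; i < 16; ++i)
    if (a[i] == b[i] || a[i] == 0 || b[i] == 0)
      return i;
  return 16;
}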
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned char __a,
+ vector unsigned char __b) {
+ vector bool char __tmp1 = vec_cmpeq(__a, __b);
+ vector bool char __tmp2 = __tmp1 |
+ vec_cmpeq((vector unsigned char)__tmp1, __a) |
+ vec_cmpeq((vector unsigned char)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed short __a, vector signed short __b) {
+ vector bool short __tmp1 = vec_cmpeq(__a, __b);
+ vector bool short __tmp2 = __tmp1 |
+ vec_cmpeq((vector signed short)__tmp1, __a) |
+ vec_cmpeq((vector signed short)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned short __a,
+ vector unsigned short __b) {
+ vector bool short __tmp1 = vec_cmpeq(__a, __b);
+ vector bool short __tmp2 = __tmp1 |
+ vec_cmpeq((vector unsigned short)__tmp1, __a) |
+ vec_cmpeq((vector unsigned short)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed int __a, vector signed int __b) {
+ vector bool int __tmp1 = vec_cmpeq(__a, __b);
+ vector bool int __tmp2 = __tmp1 | vec_cmpeq((vector signed int)__tmp1, __a) |
+ vec_cmpeq((vector signed int)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned int __a, vector unsigned int __b) {
+ vector bool int __tmp1 = vec_cmpeq(__a, __b);
+ vector bool int __tmp2 = __tmp1 |
+ vec_cmpeq((vector unsigned int)__tmp1, __a) |
+ vec_cmpeq((vector unsigned int)__tmp1, __b);
+
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)__tmp2);
+#else
+ vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+/* vec_first_mismatch_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed char __a, vector signed char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned char __a, vector unsigned char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed short __a, vector signed short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned short __a, vector unsigned short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed int __a, vector signed int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned int __a, vector unsigned int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+/* vec_first_mismatch_or_eos_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed char __a,
+ vector signed char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned char __a,
+ vector unsigned char __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 3;
+ }
+ return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed short __a,
+ vector signed short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned short __a,
+ vector unsigned short __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 4;
+ }
+ return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed int __a, vector signed int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned int __a,
+ vector unsigned int __b) {
+ vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+ vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+ vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+ if (__res[0] == 64) {
+ return (__res[1] + 64) >> 5;
+ }
+ return __res[0] >> 5;
+}
+
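The index helpers above all follow the same pattern: count the leading or trailing zero bits of the comparison result, then shift by 3, 4 or 5 to turn a bit position into a byte, halfword or word element index, yielding the element count (16, 8 or 4) when no qualifying element is found. A minimal usage sketch, assuming a Power9 target where these helpers and vec_xl are available; the function name and buffers are illustrative:

#include <altivec.h>

/* Return the index of the first differing byte of two 16-byte buffers,
   or 16 if they are identical. */
static unsigned first_diff_byte(unsigned char *x, unsigned char *y) {
  vector unsigned char va = vec_xl(0, x);
  vector unsigned char vb = vec_xl(0, y);
  return vec_first_mismatch_index(va, vb);
}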
+static __inline__ vector double __ATTRS_o_ai
+vec_insert_exp(vector double __a, vector unsigned long long __b) {
+  return __builtin_vsx_xviexpdp((vector unsigned long long)__a, __b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_insert_exp(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_vsx_xviexpdp(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_insert_exp(vector float __a, vector unsigned int __b) {
+  return __builtin_vsx_xviexpsp((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_insert_exp(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_vsx_xviexpsp(__a, __b);
+}
+
+#if defined(__powerpc64__)
+static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(signed char *__a,
+ size_t __b) {
+ return (vector signed char)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_len(unsigned char *__a, size_t __b) {
+ return (vector unsigned char)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(signed short *__a,
+ size_t __b) {
+ return (vector signed short)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xl_len(unsigned short *__a, size_t __b) {
+ return (vector unsigned short)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(signed int *__a,
+ size_t __b) {
+ return (vector signed int)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(unsigned int *__a,
+ size_t __b) {
+ return (vector unsigned int)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_xl_len(float *__a, size_t __b) {
+ return (vector float)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_xl_len(signed __int128 *__a, size_t __b) {
+ return (vector signed __int128)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_len(unsigned __int128 *__a, size_t __b) {
+ return (vector unsigned __int128)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xl_len(signed long long *__a, size_t __b) {
+ return (vector signed long long)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xl_len(unsigned long long *__a, size_t __b) {
+ return (vector unsigned long long)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a,
+ size_t __b) {
+ return (vector double)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xl_len_r(unsigned char *__a,
+ size_t __b) {
+ vector unsigned char __res =
+ (vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
+#ifdef __LITTLE_ENDIAN__
+ vector unsigned char __mask =
+ (vector unsigned char)__builtin_altivec_lvsr(16 - __b, (int *)NULL);
+ __res = (vector unsigned char)__builtin_altivec_vperm_4si(
+ (vector int)__res, (vector int)__res, __mask);
+#endif
+ return __res;
+}
+
+// vec_xst_len
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned char __a,
+ unsigned char *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed char __a,
+ signed char *__b, size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed short __a,
+ signed short *__b, size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned short __a,
+ unsigned short *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed int __a,
+ signed int *__b, size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned int __a,
+ unsigned int *__b, size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector float __a, float *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed __int128 __a,
+ signed __int128 *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned __int128 __a,
+ unsigned __int128 *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed long long __a,
+ signed long long *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned long long __a,
+ unsigned long long *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector double __a, double *__b,
+ size_t __c) {
+ return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a,
+ unsigned char *__b,
+ size_t __c) {
+#ifdef __LITTLE_ENDIAN__
+ vector unsigned char __mask =
+ (vector unsigned char)__builtin_altivec_lvsl(16 - __c, (int *)NULL);
+ vector unsigned char __res =
+ __builtin_altivec_vperm_4si((vector int)__a, (vector int)__a, __mask);
+ return __builtin_vsx_stxvll((vector int)__res, __b, (__c << 56));
+#else
+ return __builtin_vsx_stxvll((vector int)__a, __b, (__c << 56));
+#endif
+}
+#endif
+#endif
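The vec_xl_len/vec_xst_len pairs above load or store only the requested number of bytes; the shift by 56 places the byte count in the high-order byte of the doubleword that the underlying lxvl/stxvl instructions expect. A minimal sketch of a partial-vector copy, assuming a 64-bit Power9 target; the function name is illustrative:

#include <altivec.h>
#include <stddef.h>

/* Copy the first n bytes (n <= 16) of src into dst. */
static void copy_prefix(unsigned char *dst, unsigned char *src, size_t n) {
  vector unsigned char v = vec_xl_len(src, n);
  vec_xst_len(v, dst, n);
}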
+
/* vec_cpsgn */
#ifdef __VSX__
@@ -2016,20 +2969,284 @@ vec_vctuxs(vector float __a, int __b) {
return __builtin_altivec_vctuxs(__a, __b);
}
+/* vec_signed */
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sld(vector signed int, vector signed int, unsigned const int __c);
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signed(vector float __a) {
+ return __builtin_convertvector(__a, vector signed int);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_signed(vector double __a) {
+ return __builtin_convertvector(__a, vector signed long long);
+}
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_signed2(vector double __a, vector double __b) {
+ return (vector signed int) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signede(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a);
+ return vec_sld(__ret, __ret, 12);
+#else
+ return __builtin_vsx_xvcvdpsxws(__a);
+#endif
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signedo(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvdpsxws(__a);
+#else
+ vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a);
+ return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_unsigned */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sld(vector unsigned int, vector unsigned int, unsigned const int __c);
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsigned(vector float __a) {
+ return __builtin_convertvector(__a, vector unsigned int);
+}
+
+#ifdef __VSX__
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_unsigned(vector double __a) {
+ return __builtin_convertvector(__a, vector unsigned long long);
+}
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_unsigned2(vector double __a, vector double __b) {
+ return (vector unsigned int) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsignede(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a);
+ return vec_sld(__ret, __ret, 12);
+#else
+ return __builtin_vsx_xvcvdpuxws(__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsignedo(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvdpuxws(__a);
+#else
+ vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a);
+ return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_float */
+
+static __inline__ vector float __ATTRS_o_ai
+vec_sld(vector float, vector float, unsigned const int __c);
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float(vector signed int __a) {
+ return __builtin_convertvector(__a, vector float);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float(vector unsigned int __a) {
+ return __builtin_convertvector(__a, vector float);
+}
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector signed long long __a, vector signed long long __b) {
+ return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector unsigned long long __a, vector unsigned long long __b) {
+ return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector double __a, vector double __b) {
+ return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector signed long long __a) {
+#ifdef __LITTLE_ENDIAN__
+ vector float __ret = __builtin_vsx_xvcvsxdsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#else
+ return __builtin_vsx_xvcvsxdsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector unsigned long long __a) {
+#ifdef __LITTLE_ENDIAN__
+ vector float __ret = __builtin_vsx_xvcvuxdsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#else
+ return __builtin_vsx_xvcvuxdsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ vector float __ret = __builtin_vsx_xvcvdpsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#else
+ return __builtin_vsx_xvcvdpsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector signed long long __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvsxdsp(__a);
+#else
+ vector float __ret = __builtin_vsx_xvcvsxdsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector unsigned long long __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvuxdsp(__a);
+#else
+ vector float __ret = __builtin_vsx_xvcvuxdsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvdpsp(__a);
+#else
+ vector float __ret = __builtin_vsx_xvcvdpsp(__a);
+ return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
/* vec_double */
#ifdef __VSX__
static __inline__ vector double __ATTRS_o_ai
vec_double(vector signed long long __a) {
+ return __builtin_convertvector(__a, vector double);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_double(vector unsigned long long __a) {
+ return __builtin_convertvector(__a, vector double);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector signed int __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4));
+#else
+ return __builtin_vsx_xvcvsxwdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector unsigned int __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4));
+#else
+ return __builtin_vsx_xvcvuxwdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector float __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4));
+#else
+ return __builtin_vsx_xvcvspdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector signed int __a) {
vector double __ret = {__a[0], __a[1]};
return __ret;
}
static __inline__ vector double __ATTRS_o_ai
-vec_double(vector unsigned long long __a) {
+vec_doubleh(vector unsigned int __a) {
vector double __ret = {__a[0], __a[1]};
return __ret;
}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector float __a) {
+ vector double __ret = {__a[0], __a[1]};
+ return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector signed int __a) {
+ vector double __ret = {__a[2], __a[3]};
+ return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector unsigned int __a) {
+ vector double __ret = {__a[2], __a[3]};
+ return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector float __a) {
+ vector double __ret = {__a[2], __a[3]};
+ return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector signed int __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvsxwdp(__a);
+#else
+ return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector unsigned int __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvuxwdp(__a);
+#else
+ return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector float __a) {
+#ifdef __LITTLE_ENDIAN__
+ return __builtin_vsx_xvcvspdp(__a);
+#else
+ return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4));
+#endif
+}
#endif
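Taken together, these conversions split a four-element float vector into pairs of doubles: vec_doubleh and vec_doublel take the high and low halves, while vec_doublee and vec_doubleo take the even- and odd-numbered elements. A small sketch, assuming a VSX-enabled target; the function name is illustrative:

#include <altivec.h>

static void demo_double_conversions(void) {
  vector float f = {0.0f, 1.0f, 2.0f, 3.0f};
  vector double hi   = vec_doubleh(f); /* {0.0, 1.0}: elements 0 and 1 */
  vector double lo   = vec_doublel(f); /* {2.0, 3.0}: elements 2 and 3 */
  vector double even = vec_doublee(f); /* {0.0, 2.0}: elements 0 and 2 */
  vector double odd  = vec_doubleo(f); /* {1.0, 3.0}: elements 1 and 3 */
  (void)hi; (void)lo; (void)even; (void)odd;
}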
/* vec_div */
@@ -3835,6 +5052,34 @@ vec_mergee(vector unsigned int __a, vector unsigned int __b) {
0x18, 0x19, 0x1A, 0x1B));
}
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergee(vector bool long long __a, vector bool long long __b) {
+ return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergee(vector signed long long __a, vector signed long long __b) {
+ return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergee(vector unsigned long long __a, vector unsigned long long __b) {
+ return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_mergee(vector float __a, vector float __b) {
+ return vec_perm(__a, __b,
+ (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+ 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+ 0x18, 0x19, 0x1A, 0x1B));
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_mergee(vector double __a, vector double __b) {
+ return vec_mergeh(__a, __b);
+}
+
/* vec_mergeo */
static __inline__ vector bool int __ATTRS_o_ai vec_mergeo(vector bool int __a,
@@ -3861,6 +5106,34 @@ vec_mergeo(vector unsigned int __a, vector unsigned int __b) {
0x1C, 0x1D, 0x1E, 0x1F));
}
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergeo(vector bool long long __a, vector bool long long __b) {
+ return vec_mergel(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergeo(vector signed long long __a, vector signed long long __b) {
+ return vec_mergel(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergeo(vector unsigned long long __a, vector unsigned long long __b) {
+ return vec_mergel(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_mergeo(vector float __a, vector float __b) {
+ return vec_perm(__a, __b,
+ (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+ 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_mergeo(vector double __a, vector double __b) {
+ return vec_mergel(__a, __b);
+}
+
#endif
/* vec_mfvscr */
@@ -4689,6 +5962,12 @@ static __inline__ vector bool int __ATTRS_o_ai vec_nand(vector bool int __a,
return ~(__a & __b);
}
+static __inline__ vector float __ATTRS_o_ai
+vec_nand(vector float __a, vector float __b) {
+ return (vector float)(~((vector unsigned int)__a &
+ (vector unsigned int)__b));
+}
+
static __inline__ vector signed long long __ATTRS_o_ai
vec_nand(vector signed long long __a, vector signed long long __b) {
return ~(__a & __b);
@@ -4724,6 +6003,12 @@ vec_nand(vector bool long long __a, vector bool long long __b) {
return ~(__a & __b);
}
+static __inline__ vector double __ATTRS_o_ai
+vec_nand(vector double __a, vector double __b) {
+ return (vector double)(~((vector unsigned long long)__a &
+ (vector unsigned long long)__b));
+}
+
#endif
/* vec_nmadd */
@@ -5195,6 +6480,16 @@ static __inline__ vector bool int __ATTRS_o_ai vec_orc(vector bool int __a,
return __a | ~__b;
}
+static __inline__ vector float __ATTRS_o_ai
+vec_orc(vector bool int __a, vector float __b) {
+ return (vector float)(__a | ~(vector unsigned int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_orc(vector float __a, vector bool int __b) {
+ return (vector float)((vector unsigned int)__a | ~__b);
+}
+
static __inline__ vector signed long long __ATTRS_o_ai
vec_orc(vector signed long long __a, vector signed long long __b) {
return __a | ~__b;
@@ -5229,6 +6524,16 @@ static __inline__ vector bool long long __ATTRS_o_ai
vec_orc(vector bool long long __a, vector bool long long __b) {
return __a | ~__b;
}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_orc(vector double __a, vector bool long long __b) {
+ return (vector double)((vector unsigned long long)__a | ~__b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector double __b) {
+ return (vector double)(__a | ~(vector unsigned long long)__b);
+}
#endif
/* vec_vor */
@@ -5536,8 +6841,25 @@ vec_pack(vector bool long long __a, vector bool long long __b) {
#endif
}
+static __inline__ vector float __ATTRS_o_ai
+vec_pack(vector double __a, vector double __b) {
+ return (vector float) (__a[0], __a[1], __b[0], __b[1]);
+}
#endif
+#ifdef __POWER9_VECTOR__
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_pack_to_short_fp32(vector float __a, vector float __b) {
+ vector float __resa = __builtin_vsx_xvcvsphp(__a);
+ vector float __resb = __builtin_vsx_xvcvsphp(__b);
+#ifdef __LITTLE_ENDIAN__
+ return (vector unsigned short)vec_mergee(__resa, __resb);
+#else
+ return (vector unsigned short)vec_mergeo(__resa, __resb);
+#endif
+}
+
+#endif
/* vec_vpkuhum */
#define __builtin_altivec_vpkuhum vec_vpkuhum
@@ -6324,6 +7646,34 @@ vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
}
#endif
+/* vec_rlmi */
+#ifdef __POWER9_VECTOR__
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rlmi(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+ return __builtin_altivec_vrlwmi(__a, __c, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rlmi(vector unsigned long long __a, vector unsigned long long __b,
+ vector unsigned long long __c) {
+ return __builtin_altivec_vrldmi(__a, __c, __b);
+}
+
+/* vec_rlnm */
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rlnm(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+ return __builtin_altivec_vrlwnm(__a, __b) & __c;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rlnm(vector unsigned long long __a, vector unsigned long long __b,
+ vector unsigned long long __c) {
+ return __builtin_altivec_vrldnm(__a, __b) & __c;
+}
+#endif
+
/* vec_vrlb */
static __inline__ vector signed char __ATTRS_o_ai
@@ -6984,6 +8334,145 @@ static __inline__ vector float __ATTRS_o_ai vec_sld(vector float __a,
#endif
}
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_sld(vector bool long long __a, vector bool long long __b,
+ unsigned const int __c) {
+ unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+ return vec_perm(
+ __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+ 20 - __d, 21 - __d, 22 - __d, 23 - __d,
+ 24 - __d, 25 - __d, 26 - __d, 27 - __d,
+ 28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+ return vec_perm(
+ __a, __b,
+ (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+ __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+ __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sld(vector signed long long __a, vector signed long long __b,
+ unsigned const int __c) {
+ unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+ return vec_perm(
+ __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+ 20 - __d, 21 - __d, 22 - __d, 23 - __d,
+ 24 - __d, 25 - __d, 26 - __d, 27 - __d,
+ 28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+ return vec_perm(
+ __a, __b,
+ (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+ __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+ __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sld(vector unsigned long long __a, vector unsigned long long __b,
+ unsigned const int __c) {
+ unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+ return vec_perm(
+ __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+ 20 - __d, 21 - __d, 22 - __d, 23 - __d,
+ 24 - __d, 25 - __d, 26 - __d, 27 - __d,
+ 28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+ return vec_perm(
+ __a, __b,
+ (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+ __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+ __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_sld(vector double __a,
+ vector double __b,
+ unsigned const int __c) {
+ unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+ return vec_perm(
+ __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+ 20 - __d, 21 - __d, 22 - __d, 23 - __d,
+ 24 - __d, 25 - __d, 26 - __d, 27 - __d,
+ 28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+ return vec_perm(
+ __a, __b,
+ (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+ __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+ __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+#endif
+
+/* vec_sldw */
+static __inline__ vector signed char __ATTRS_o_ai vec_sldw(
+ vector signed char __a, vector signed char __b, unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sldw(vector unsigned char __a, vector unsigned char __b,
+ unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_sldw(
+ vector signed short __a, vector signed short __b, unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sldw(vector unsigned short __a, vector unsigned short __b,
+ unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sldw(vector signed int __a, vector signed int __b, unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_sldw(
+ vector unsigned int __a, vector unsigned int __b, unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sldw(vector signed long long __a, vector signed long long __b,
+ unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sldw(vector unsigned long long __a, vector unsigned long long __b,
+ unsigned const int __c) {
+ return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+#endif
+
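vec_sldw simply scales its word count into the byte count that vec_sld expects, so shifting by one word is the same as a vec_sld by four bytes. A minimal equivalence sketch, assuming an AltiVec-enabled target:

#include <altivec.h>

static void demo_sldw(vector signed int a, vector signed int b) {
  vector signed int r1 = vec_sldw(a, b, 1);
  vector signed int r2 = vec_sld(a, b, 4); /* same result as r1 */
  (void)r1; (void)r2;
}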
+#ifdef __POWER9_VECTOR__
+/* vec_slv */
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_slv(vector unsigned char __a, vector unsigned char __b) {
+ return __builtin_altivec_vslv(__a, __b);
+}
+
+/* vec_srv */
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srv(vector unsigned char __a, vector unsigned char __b) {
+ return __builtin_altivec_vsrv(__a, __b);
+}
+#endif
+
/* vec_vsldoi */
static __inline__ vector signed char __ATTRS_o_ai
@@ -7307,6 +8796,20 @@ vec_sll(vector bool int __a, vector unsigned int __b) {
(vector int)__b);
}
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sll(vector signed long long __a, vector unsigned char __b) {
+ return (vector signed long long)__builtin_altivec_vsl((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sll(vector unsigned long long __a, vector unsigned char __b) {
+ return (vector unsigned long long)__builtin_altivec_vsl((vector int)__a,
+ (vector int)__b);
+}
+#endif
+
/* vec_vsl */
static __inline__ vector signed char __ATTRS_o_ai
@@ -7570,6 +9073,32 @@ static __inline__ vector float __ATTRS_o_ai vec_slo(vector float __a,
return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
}
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_slo(vector signed long long __a, vector signed char __b) {
+ return (vector signed long long)__builtin_altivec_vslo((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_slo(vector signed long long __a, vector unsigned char __b) {
+ return (vector signed long long)__builtin_altivec_vslo((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_slo(vector unsigned long long __a, vector signed char __b) {
+ return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_slo(vector unsigned long long __a, vector unsigned char __b) {
+ return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a,
+ (vector int)__b);
+}
+#endif
+
/* vec_vslo */
static __inline__ vector signed char __ATTRS_o_ai
@@ -8304,6 +9833,20 @@ vec_srl(vector bool int __a, vector unsigned int __b) {
(vector int)__b);
}
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_srl(vector signed long long __a, vector unsigned char __b) {
+ return (vector signed long long)__builtin_altivec_vsr((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_srl(vector unsigned long long __a, vector unsigned char __b) {
+ return (vector unsigned long long)__builtin_altivec_vsr((vector int)__a,
+ (vector int)__b);
+}
+#endif
+
/* vec_vsr */
static __inline__ vector signed char __ATTRS_o_ai
@@ -8567,6 +10110,32 @@ static __inline__ vector float __ATTRS_o_ai vec_sro(vector float __a,
return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
}
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sro(vector signed long long __a, vector signed char __b) {
+ return (vector signed long long)__builtin_altivec_vsro((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sro(vector signed long long __a, vector unsigned char __b) {
+ return (vector signed long long)__builtin_altivec_vsro((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sro(vector unsigned long long __a, vector signed char __b) {
+ return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a,
+ (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sro(vector unsigned long long __a, vector unsigned char __b) {
+ return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a,
+ (vector int)__b);
+}
+#endif
+
/* vec_vsro */
static __inline__ vector signed char __ATTRS_o_ai
@@ -9580,6 +11149,12 @@ vec_vsubfp(vector float __a, vector float __b) {
/* vec_subc */
+static __inline__ vector signed int __ATTRS_o_ai
+vec_subc(vector signed int __a, vector signed int __b) {
+ return (vector signed int)__builtin_altivec_vsubcuw((vector unsigned int)__a,
+ (vector unsigned int) __b);
+}
+
static __inline__ vector unsigned int __ATTRS_o_ai
vec_subc(vector unsigned int __a, vector unsigned int __b) {
return __builtin_altivec_vsubcuw(__a, __b);
@@ -9813,6 +11388,7 @@ vec_vsubuqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
/* vec_vsubeuqm */
+
static __inline__ vector signed __int128 __ATTRS_o_ai
vec_vsubeuqm(vector signed __int128 __a, vector signed __int128 __b,
vector signed __int128 __c) {
@@ -9825,6 +11401,18 @@ vec_vsubeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
return __builtin_altivec_vsubeuqm(__a, __b, __c);
}
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_sube(vector signed __int128 __a, vector signed __int128 __b,
+ vector signed __int128 __c) {
+ return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_sube(vector unsigned __int128 __a, vector unsigned __int128 __b,
+ vector unsigned __int128 __c) {
+ return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
/* vec_vsubcuq */
static __inline__ vector signed __int128 __ATTRS_o_ai
@@ -9850,8 +11438,47 @@ vec_vsubecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
vector unsigned __int128 __c) {
return __builtin_altivec_vsubecuq(__a, __b, __c);
}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_subec(vector signed int __a, vector signed int __b,
+ vector signed int __c) {
+ return vec_addec(__a, ~__b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subec(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+ return vec_addec(__a, ~__b, __c);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_subec(vector signed __int128 __a, vector signed __int128 __b,
+ vector signed __int128 __c) {
+ return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_subec(vector unsigned __int128 __a, vector unsigned __int128 __b,
+ vector unsigned __int128 __c) {
+ return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sube(vector signed int __a, vector signed int __b,
+ vector signed int __c) {
+ vector signed int __mask = {1, 1, 1, 1};
+ vector signed int __carry = __c & __mask;
+ return vec_adde(__a, ~__b, __carry);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sube(vector unsigned int __a, vector unsigned int __b,
+ vector unsigned int __c) {
+ vector unsigned int __mask = {1, 1, 1, 1};
+ vector unsigned int __carry = __c & __mask;
+ return vec_adde(__a, ~__b, __carry);
+}
/* vec_sum4s */
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a,
@@ -10051,6 +11678,11 @@ vec_unpackh(vector bool int __a) {
return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
#endif
}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_unpackh(vector float __a) {
+ return (vector double)(__a[0], __a[1]);
+}
#endif
/* vec_vupkhsb */
@@ -10185,6 +11817,11 @@ vec_unpackl(vector bool int __a) {
return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
#endif
}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_unpackl(vector float __a) {
+ return (vector double)(__a[2], __a[3]);
+}
#endif
/* vec_vupklsb */
@@ -10935,6 +12572,55 @@ static __inline__ float __ATTRS_o_ai vec_extract(vector float __a, int __b) {
return __a[__b];
}
+#ifdef __POWER9_VECTOR__
+
+/* vec_extract_exp */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_extract_exp(vector float __a) {
+ return __builtin_vsx_xvxexpsp(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_extract_exp(vector double __a) {
+ return __builtin_vsx_xvxexpdp(__a);
+}
+
+/* vec_extract_sig */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_extract_sig(vector float __a) {
+ return __builtin_vsx_xvxsigsp(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_extract_sig(vector double __a) {
+ return __builtin_vsx_xvxsigdp(__a);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_extract_fp32_from_shorth(vector unsigned short __a) {
+ vector unsigned short __b =
+#ifdef __LITTLE_ENDIAN__
+ __builtin_shufflevector(__a, __a, 0, -1, 1, -1, 2, -1, 3, -1);
+#else
+ __builtin_shufflevector(__a, __a, -1, 0, -1, 1, -1, 2, -1, 3);
+#endif
+ return __builtin_vsx_xvcvhpsp(__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_extract_fp32_from_shortl(vector unsigned short __a) {
+ vector unsigned short __b =
+#ifdef __LITTLE_ENDIAN__
+ __builtin_shufflevector(__a, __a, 4, -1, 5, -1, 6, -1, 7, -1);
+#else
+ __builtin_shufflevector(__a, __a, -1, 4, -1, 5, -1, 6, -1, 7);
+#endif
+ return __builtin_vsx_xvcvhpsp(__b);
+}
+#endif /* __POWER9_VECTOR__ */
+
/* vec_insert */
static __inline__ vector signed char __ATTRS_o_ai
@@ -14369,6 +16055,24 @@ __builtin_crypto_vncipherlast(vector unsigned long long __a,
#endif
#ifdef __POWER8_VECTOR__
+static __inline__ vector bool char __ATTRS_o_ai
+vec_permxor(vector bool char __a, vector bool char __b,
+ vector bool char __c) {
+ return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_permxor(vector signed char __a, vector signed char __b,
+ vector signed char __c) {
+ return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_permxor(vector unsigned char __a, vector unsigned char __b,
+ vector unsigned char __c) {
+ return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
static __inline__ vector unsigned char __ATTRS_o_ai
__builtin_crypto_vpermxor(vector unsigned char __a, vector unsigned char __b,
vector unsigned char __c) {
@@ -14453,6 +16157,572 @@ vec_bperm(vector unsigned __int128 __a, vector unsigned char __b) {
#endif
#endif
+
+/* vec_reve */
+
+static inline __ATTRS_o_ai vector bool char vec_reve(vector bool char __a) {
+ return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+ 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed char vec_reve(vector signed char __a) {
+ return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+ 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_reve(vector unsigned char __a) {
+ return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+ 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector bool int vec_reve(vector bool int __a) {
+ return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int vec_reve(vector signed int __a) {
+ return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_reve(vector unsigned int __a) {
+ return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector bool short vec_reve(vector bool short __a) {
+ return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_reve(vector signed short __a) {
+ return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_reve(vector unsigned short __a) {
+ return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector float vec_reve(vector float __a) {
+ return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai vector bool long long
+vec_reve(vector bool long long __a) {
+ return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_reve(vector signed long long __a) {
+ return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_reve(vector unsigned long long __a) {
+ return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector double vec_reve(vector double __a) {
+ return __builtin_shufflevector(__a, __a, 1, 0);
+}
+#endif
+
+/* vec_revb */
+static __inline__ vector bool char __ATTRS_o_ai
+vec_revb(vector bool char __a) {
+ return __a;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_revb(vector signed char __a) {
+ return __a;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_revb(vector unsigned char __a) {
+ return __a;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_revb(vector bool short __a) {
+ vector unsigned char __indices =
+ { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_revb(vector signed short __a) {
+ vector unsigned char __indices =
+ { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_revb(vector unsigned short __a) {
+ vector unsigned char __indices =
+ { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_revb(vector bool int __a) {
+ vector unsigned char __indices =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_revb(vector signed int __a) {
+ vector unsigned char __indices =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_revb(vector unsigned int __a) {
+ vector unsigned char __indices =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_revb(vector float __a) {
+ vector unsigned char __indices =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return vec_perm(__a, __a, __indices);
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_revb(vector bool long long __a) {
+ vector unsigned char __indices =
+ { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_revb(vector signed long long __a) {
+ vector unsigned char __indices =
+ { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_revb(vector unsigned long long __a) {
+ vector unsigned char __indices =
+ { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_revb(vector double __a) {
+ vector unsigned char __indices =
+ { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ return vec_perm(__a, __a, __indices);
+}
+#endif /* End __VSX__ */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_revb(vector signed __int128 __a) {
+ vector unsigned char __indices =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+ return (vector signed __int128)vec_perm((vector signed int)__a,
+ (vector signed int)__a,
+ __indices);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_revb(vector unsigned __int128 __a) {
+ vector unsigned char __indices =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+ return (vector unsigned __int128)vec_perm((vector signed int)__a,
+ (vector signed int)__a,
+ __indices);
+}
+#endif /* END __POWER8_VECTOR__ && __powerpc64__ */
+
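vec_reve and vec_revb are easy to confuse: the former reverses the order of the elements, while the latter reverses the bytes inside each element (a per-element byte swap). A small sketch, assuming a target where these helpers are available; the values are illustrative:

#include <altivec.h>

static void demo_reversal(void) {
  vector unsigned int v = {0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F};
  vector unsigned int e = vec_reve(v);
  /* e = {0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203} */
  vector unsigned int b = vec_revb(v);
  /* b = {0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C} */
  (void)e; (void)b;
}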
+/* vec_xl */
+
+static inline __ATTRS_o_ai vector signed char vec_xl(signed long long __offset,
+ signed char *__ptr) {
+ return *(vector signed char *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xl(signed long long __offset, unsigned char *__ptr) {
+ return *(vector unsigned char *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short vec_xl(signed long long __offset,
+ signed short *__ptr) {
+ return *(vector signed short *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xl(signed long long __offset, unsigned short *__ptr) {
+ return *(vector unsigned short *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int vec_xl(signed long long __offset,
+ signed int *__ptr) {
+ return *(vector signed int *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int vec_xl(signed long long __offset,
+ unsigned int *__ptr) {
+ return *(vector unsigned int *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset,
+ float *__ptr) {
+ return *(vector float *)(__ptr + __offset);
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai vector signed long long
+vec_xl(signed long long __offset, signed long long *__ptr) {
+ return *(vector signed long long *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xl(signed long long __offset, unsigned long long *__ptr) {
+ return *(vector unsigned long long *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset,
+ double *__ptr) {
+ return *(vector double *)(__ptr + __offset);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static inline __ATTRS_o_ai vector signed __int128
+vec_xl(signed long long __offset, signed __int128 *__ptr) {
+ return *(vector signed __int128 *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned __int128
+vec_xl(signed long long __offset, unsigned __int128 *__ptr) {
+ return *(vector unsigned __int128 *)(__ptr + __offset);
+}
+#endif
+
+/* vec_xl_be */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed char *__ptr) {
+ vector signed char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+ return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+ 13, 12, 11, 10, 9, 8);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned char *__ptr) {
+ vector unsigned char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+ return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+ 13, 12, 11, 10, 9, 8);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed short *__ptr) {
+ vector signed short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+ return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned short *__ptr) {
+ vector unsigned short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+ return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed int *__ptr) {
+ return (vector signed int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned int *__ptr) {
+ return (vector unsigned int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_xl_be(signed long long __offset, float *__ptr) {
+ return (vector float)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed long long *__ptr) {
+ return (vector signed long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned long long *__ptr) {
+ return (vector unsigned long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_xl_be(signed long long __offset, double *__ptr) {
+ return (vector double)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed __int128 *__ptr) {
+ return vec_xl(__offset, __ptr);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned __int128 *__ptr) {
+ return vec_xl(__offset, __ptr);
+}
+#endif
+#else
+ #define vec_xl_be vec_xl
+#endif
+
+/* vec_xst */
+
+static inline __ATTRS_o_ai void vec_xst(vector signed char __vec,
+ signed long long __offset,
+ signed char *__ptr) {
+ *(vector signed char *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned char __vec,
+ signed long long __offset,
+ unsigned char *__ptr) {
+ *(vector unsigned char *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector signed short __vec,
+ signed long long __offset,
+ signed short *__ptr) {
+ *(vector signed short *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned short __vec,
+ signed long long __offset,
+ unsigned short *__ptr) {
+ *(vector unsigned short *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector signed int __vec,
+ signed long long __offset,
+ signed int *__ptr) {
+ *(vector signed int *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned int __vec,
+ signed long long __offset,
+ unsigned int *__ptr) {
+ *(vector unsigned int *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector float __vec,
+ signed long long __offset,
+ float *__ptr) {
+ *(vector float *)(__ptr + __offset) = __vec;
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai void vec_xst(vector signed long long __vec,
+ signed long long __offset,
+ signed long long *__ptr) {
+ *(vector signed long long *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned long long __vec,
+ signed long long __offset,
+ unsigned long long *__ptr) {
+ *(vector unsigned long long *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector double __vec,
+ signed long long __offset,
+ double *__ptr) {
+ *(vector double *)(__ptr + __offset) = __vec;
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static inline __ATTRS_o_ai void vec_xst(vector signed __int128 __vec,
+ signed long long __offset,
+ signed __int128 *__ptr) {
+ *(vector signed __int128 *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec,
+ signed long long __offset,
+ unsigned __int128 *__ptr) {
+ *(vector unsigned __int128 *)(__ptr + __offset) = __vec;
+}
+#endif
+
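vec_xl and vec_xst above are plain vector loads and stores at an offset from the given pointer, and the _be variants add a big-endian element-ordering fix-up on little-endian targets. A minimal round-trip sketch; the function name is illustrative:

#include <altivec.h>

/* Copy one full 16-byte vector from src to dst. */
static void copy16(unsigned char *dst, unsigned char *src) {
  vector unsigned char v = vec_xl(0, src);
  vec_xst(v, 0, dst);
}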
+/* vec_xst_be */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed char __vec,
+ signed long long __offset,
+ signed char *__ptr) {
+ vector signed char __tmp =
+ __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+ 13, 12, 11, 10, 9, 8);
+ __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned char __vec,
+ signed long long __offset,
+ unsigned char *__ptr) {
+ vector unsigned char __tmp =
+ __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+ 13, 12, 11, 10, 9, 8);
+ __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed short __vec,
+ signed long long __offset,
+ signed short *__ptr) {
+ vector signed short __tmp =
+ __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+ __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned short __vec,
+ signed long long __offset,
+ unsigned short *__ptr) {
+ vector unsigned short __tmp =
+ __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+ __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed int __vec,
+ signed long long __offset,
+ signed int *__ptr) {
+ __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned int __vec,
+ signed long long __offset,
+ unsigned int *__ptr) {
+ __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector float __vec,
+ signed long long __offset,
+ float *__ptr) {
+ __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+#ifdef __VSX__
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed long long __vec,
+ signed long long __offset,
+ signed long long *__ptr) {
+ __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned long long __vec,
+ signed long long __offset,
+ unsigned long long *__ptr) {
+ __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector double __vec,
+ signed long long __offset,
+ double *__ptr) {
+ __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed __int128 __vec,
+ signed long long __offset,
+ signed __int128 *__ptr) {
+ vec_xst(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned __int128 __vec,
+ signed long long __offset,
+ unsigned __int128 *__ptr) {
+ vec_xst(__vec, __offset, __ptr);
+}
+#endif
+#else
+ #define vec_xst_be vec_xst
+#endif
+
+#ifdef __POWER9_VECTOR__
+#define vec_test_data_class(__a, __b) \
+ _Generic((__a), \
+ vector float: \
+ (vector bool int)__builtin_vsx_xvtstdcsp((__a), (__b)), \
+ vector double: \
+ (vector bool long long)__builtin_vsx_xvtstdcdp((__a), (__b)) \
+ )
+
+#endif /* #ifdef __POWER9_VECTOR__ */
+
+static vector float __ATTRS_o_ai vec_neg(vector float __a) {
+ return -__a;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_neg(vector double __a) {
+ return -__a;
+}
+
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector long long __ATTRS_o_ai vec_neg(vector long long __a) {
+ return -__a;
+}
+#endif
+
+static vector signed int __ATTRS_o_ai vec_neg(vector signed int __a) {
+ return -__a;
+}
+
+static vector signed short __ATTRS_o_ai vec_neg(vector signed short __a) {
+ return -__a;
+}
+
+static vector signed char __ATTRS_o_ai vec_neg(vector signed char __a) {
+ return -__a;
+}
+
+static vector float __ATTRS_o_ai vec_nabs(vector float __a) {
+  return -vec_abs(__a);
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_nabs(vector double __a) {
+  return -vec_abs(__a);
+}
+
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector long long __ATTRS_o_ai vec_nabs(vector long long __a) {
+ return __builtin_altivec_vminsd(__a, -__a);
+}
+#endif
+
+static vector signed int __ATTRS_o_ai vec_nabs(vector signed int __a) {
+ return __builtin_altivec_vminsw(__a, -__a);
+}
+
+static vector signed short __ATTRS_o_ai vec_nabs(vector signed short __a) {
+ return __builtin_altivec_vminsh(__a, -__a);
+}
+
+static vector signed char __ATTRS_o_ai vec_nabs(vector signed char __a) {
+ return __builtin_altivec_vminsb(__a, -__a);
+}
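For the integer element types, vec_nabs computes the negated absolute value as min(a, -a), which needs no separate abs step. An illustrative sketch:

#include <altivec.h>

static void demo_nabs(void) {
  vector signed int v = {3, -7, 0, 42};
  vector signed int n = vec_nabs(v); /* {-3, -7, 0, -42} */
  (void)n;
}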
#undef __ATTRS_o_ai
#endif /* __ALTIVEC_H */
diff --git a/lib/Headers/ammintrin.h b/lib/Headers/ammintrin.h
index 8985bb404f47..2843a7a2677f 100644
--- a/lib/Headers/ammintrin.h
+++ b/lib/Headers/ammintrin.h
@@ -30,7 +30,7 @@
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
-/// integer vector operand at the index idx and of the length len.
+/// integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
@@ -38,7 +38,7 @@
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
-/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
/// The value from which bits are extracted.
@@ -49,8 +49,8 @@
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
-/// [63:0] of parameter x are extracted. If the length is zero but the index
-/// is non-zero, the result is undefined.
+/// [63:0] of parameter \a x are extracted. If the length is zero but the
+/// index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
/// extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
@@ -58,11 +58,12 @@
(char)(len), (char)(idx)))
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
-/// integer vector operand at the index and of the length specified by __y.
+/// integer vector operand at the index and of the length specified by
+/// \a __y.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param __x
/// The value from which bits are extracted.
@@ -71,8 +72,8 @@
/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
/// length is interpreted as 64. If the sum of the index and length is
/// greater than 64, the result is undefined. If the length and index are
-/// both zero, bits [63:0] of parameter __x are extracted. If the length is
-/// zero but the index is non-zero, the result is undefined.
+/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
+/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
/// from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -81,9 +82,9 @@ _mm_extract_si64(__m128i __x, __m128i __y)
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}
-/// \brief Inserts bits of a specified length from the source integer vector y
-/// into the lower 64 bits of the destination integer vector x at the index
-/// idx and of the length len.
+/// \brief Inserts bits of a specified length from the source integer vector
+/// \a y into the lower 64 bits of the destination integer vector \a x at
+/// the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
@@ -92,15 +93,15 @@ _mm_extract_si64(__m128i __x, __m128i __y)
/// const int idx);
/// \endcode
///
-/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
/// The destination operand where bits will be inserted. The inserted bits
-/// are defined by the length len and by the index idx specifying the least
-/// significant bit.
+/// are defined by the length \a len and by the index \a idx specifying the
+/// least significant bit.
/// \param y
/// The source operand containing the bits to be extracted. The extracted
-/// bits are the least significant bits of operand y of length len.
+/// bits are the least significant bits of operand \a y of length \a len.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
@@ -108,45 +109,43 @@ _mm_extract_si64(__m128i __x, __m128i __y)
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
-/// [63:0] of parameter y are inserted into parameter x. If the length is
-/// zero but the index is non-zero, the result is undefined.
+/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
+/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
-/// destination operand x with the specified bitfields replaced by the lower
-/// bits of source operand y. The upper 64 bits of the return value are
-/// undefined.
-
+/// destination operand \a x with the specified bitfields replaced by the
+/// lower bits of source operand \a y. The upper 64 bits of the return value
+/// are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
(__v2di)(__m128i)(y), \
(char)(len), (char)(idx)))
/// \brief Inserts bits of a specified length from the source integer vector
-/// __y into the lower 64 bits of the destination integer vector __x at the
-/// index and of the length specified by __y.
+/// \a __y into the lower 64 bits of the destination integer vector \a __x
+/// at the index and of the length specified by \a __y.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param __x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length and by the index of the least significant bit
-/// specified by operand __y.
+/// specified by operand \a __y.
/// \param __y
/// The source operand containing the bits to be extracted. The extracted
-/// bits are the least significant bits of operand __y with length specified
-/// by bits [69:64]. These are inserted into the destination at the index
-/// specified by bits [77:72]; all other bits are ignored. If bits [69:64]
-/// are zero, the length is interpreted as 64. If the sum of the index and
-/// length is greater than 64, the result is undefined. If the length and
-/// index are both zero, bits [63:0] of parameter __y are inserted into
-/// parameter __x. If the length is zero but the index is non-zero, the
-/// result is undefined.
+/// bits are the least significant bits of operand \a __y with length
+/// specified by bits [69:64]. These are inserted into the destination at the
+/// index specified by bits [77:72]; all other bits are ignored. If bits
+/// [69:64] are zero, the length is interpreted as 64. If the sum of the
+/// index and length is greater than 64, the result is undefined. If the
+/// length and index are both zero, bits [63:0] of parameter \a __y are
+/// inserted into parameter \a __x. If the length is zero but the index is
+/// non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
-/// destination operand __x with the specified bitfields replaced by the
-/// lower bits of source operand __y. The upper 64 bits of the return value
-/// are undefined.
-
+/// destination operand \a __x with the specified bitfields replaced by the
+/// lower bits of source operand \a __y. The upper 64 bits of the return
+/// value are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x, __m128i __y)
{
@@ -159,7 +158,7 @@ _mm_insert_si64(__m128i __x, __m128i __y)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MOVNTSD instruction.
+/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
///
/// \param __p
/// The 64-bit memory location used to store the register value.
@@ -177,7 +176,7 @@ _mm_stream_sd(double *__p, __m128d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MOVNTSS instruction.
+/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
///
/// \param __p
/// The 32-bit memory location used to store the register value.
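 
The EXTRQ/INSERTQ documentation updated above describes a matched pair of operations: a bit-field is pulled out of the low 64 bits of one vector and dropped into the low 64 bits of another. A minimal sketch of how the immediate forms combine (assumes an SSE4A-capable target built with -msse4a; the copy_field name and the particular len/idx values are arbitrary examples):

  #include <x86intrin.h>

  /* Copy an 8-bit field starting at bit 4 of the low quadword of src into
     bit position 16 of the low quadword of dst.  len and idx must be
     compile-time constants for these immediate forms. */
  __m128i copy_field(__m128i src, __m128i dst) {
    __m128i field = _mm_extracti_si64(src, /*len=*/8, /*idx=*/4);
    return _mm_inserti_si64(dst, field, /*len=*/8, /*idx=*/16);
  }
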
diff --git a/lib/Headers/armintr.h b/lib/Headers/armintr.h
new file mode 100644
index 000000000000..933afcbb91b6
--- /dev/null
+++ b/lib/Headers/armintr.h
@@ -0,0 +1,45 @@
+/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the Windows platform. */
+#ifndef _MSC_VER
+#include_next <armintr.h>
+#else
+
+#ifndef __ARMINTR_H
+#define __ARMINTR_H
+
+typedef enum
+{
+ _ARM_BARRIER_SY = 0xF,
+ _ARM_BARRIER_ST = 0xE,
+ _ARM_BARRIER_ISH = 0xB,
+ _ARM_BARRIER_ISHST = 0xA,
+ _ARM_BARRIER_NSH = 0x7,
+ _ARM_BARRIER_NSHST = 0x6,
+ _ARM_BARRIER_OSH = 0x3,
+ _ARM_BARRIER_OSHST = 0x2
+} _ARMINTR_BARRIER_TYPE;
+
+#endif /* __ARMINTR_H */
+#endif /* _MSC_VER */
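 
The _ARMINTR_BARRIER_TYPE enumerators above mirror the option encodings of the ARM DMB/DSB barrier instructions and are intended as arguments to the MSVC-style barrier intrinsics. A hedged sketch, assuming the __dmb intrinsic is declared for the target (it is not declared in this header, so treat the call as an assumption):

  /* Publish data before setting a flag: an inner-shareable data memory
     barrier (_ARM_BARRIER_ISH) orders the store to *data before the store
     to *flag as observed within the same shareability domain. */
  void publish(int *data, volatile int *flag) {
    *data = 42;
    __dmb(_ARM_BARRIER_ISH);
    *flag = 1;
  }
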
diff --git a/lib/Headers/avx512bwintrin.h b/lib/Headers/avx512bwintrin.h
index d3c5a6c96446..629dc8611a7f 100644
--- a/lib/Headers/avx512bwintrin.h
+++ b/lib/Headers/avx512bwintrin.h
@@ -350,19 +350,17 @@ _mm512_add_epi8 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) __W,
- (__mmask64) __U);
+_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_add_epi8(__A, __B),
+ (__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) __U);
+_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_add_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_qi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -371,19 +369,17 @@ _mm512_sub_epi8 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) __W,
- (__mmask64) __U);
+_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_sub_epi8(__A, __B),
+ (__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) __U);
+_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_sub_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_qi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -392,19 +388,17 @@ _mm512_add_epi16 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_add_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) _mm512_setzero_hi(),
- (__mmask32) __U);
+_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_add_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -413,19 +407,17 @@ _mm512_sub_epi16 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sub_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) _mm512_setzero_hi(),
- (__mmask32) __U);
+_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sub_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -434,19 +426,17 @@ _mm512_mullo_epi16 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mullo_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) _mm512_setzero_hi(),
- (__mmask32) __U);
+_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mullo_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1018,31 +1008,25 @@ _mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shuffle_epi8 (__m512i __A, __m512i __B)
+_mm512_shuffle_epi8(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) -1);
+ return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A, (__v64qi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
- __m512i __B)
+_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) __W,
- (__mmask64) __U);
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_shuffle_epi8(__A, __B),
+ (__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) __U);
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_shuffle_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_qi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1537,55 +1521,49 @@ _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi8_epi16 (__m256i __A)
+_mm512_cvtepi8_epi16(__m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ /* This function always performs a signed extension, but __v32qi is a vector
+ of char, which may be signed or unsigned, so use __v32qs. */
+ return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepi8_epi16(__A),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A)
+_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
- (__v32hi)
- _mm512_setzero_hi(),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepi8_epi16(__A),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu8_epi16 (__m256i __A)
+_mm512_cvtepu8_epi16(__m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepu8_epi16(__A),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A)
+_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
- (__v32hi)
- _mm512_setzero_hi(),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepu8_epi16(__A),
+ (__v32hi)_mm512_setzero_hi());
}
@@ -1704,79 +1682,70 @@ _mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A)
(__v32hi)_mm512_setzero_hi()); })
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sllv_epi16 (__m512i __A, __m512i __B)
+_mm512_sllv_epi16(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i)__builtin_ia32_psllv32hi((__v32hi) __A, (__v32hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m512i __B)
+_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sllv_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sllv_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sll_epi16 (__m512i __A, __m128i __B)
+_mm512_sll_epi16(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m128i __B)
+_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sll_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sll_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
-#define _mm512_slli_epi16(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi16(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
+}
-#define _mm512_mask_slli_epi16(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)(__m512i)(W), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_slli_epi16(__A, __B),
+ (__v32hi)__W);
+}
-#define _mm512_maskz_slli_epi16(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_slli_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
+}
#define _mm512_bslli_epi128(a, imm) __extension__ ({ \
(__m512i)__builtin_shufflevector( \
@@ -1848,155 +1817,136 @@ _mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
((char)(imm)&0xF0) ? 63 : ((char)(imm)>0xF ? 79 : 127) - (char)(imm)); })
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srlv_epi16 (__m512i __A, __m512i __B)
+_mm512_srlv_epi16(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m512i __B)
+_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srlv_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srlv_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srav_epi16 (__m512i __A, __m512i __B)
+_mm512_srav_epi16(__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m512i __B)
+_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srav_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srav_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sra_epi16 (__m512i __A, __m128i __B)
+_mm512_sra_epi16(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m128i __B)
+_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sra_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sra_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
-#define _mm512_srai_epi16(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)-1); })
-
-#define _mm512_mask_srai_epi16(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)(__m512i)(W), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi16(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
+}
-#define _mm512_maskz_srai_epi16(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srai_epi16(__A, __B),
+ (__v32hi)__W);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srai_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
+}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srl_epi16 (__m512i __A, __m128i __B)
+_mm512_srl_epi16(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m128i __B)
+_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srl_epi16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
- (__v8hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srl_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
}
-#define _mm512_srli_epi16(A, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi16(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
+}
-#define _mm512_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
- (__v32hi)(__m512i)(W), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srli_epi16(__A, __B),
+ (__v32hi)__W);
+}
-#define _mm512_maskz_srli_epi16(U, A, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(imm), \
- (__v32hi)_mm512_setzero_hi(), \
- (__mmask32)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srli_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_hi());
+}
#define _mm512_bsrli_epi128(a, imm) __extension__ ({ \
(__m512i)__builtin_shufflevector( \
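 
Throughout this file the *_mask/*_maskz intrinsics are rewritten to compute the unmasked operation and then blend the result through __builtin_ia32_selectb_512/selectw_512. A scalar model of what that per-lane select does (illustrative only; the model function name is made up, and the real builtin operates on whole 512-bit registers):

  /* For each of the 32 16-bit lanes, keep the freshly computed lane when
     the corresponding mask bit is set, otherwise keep the pass-through
     lane (__W for the _mask forms, zero for the _maskz forms). */
  static void select_w512_model(unsigned int mask, const short *computed,
                                const short *passthru, short *out) {
    for (int i = 0; i < 32; ++i)
      out[i] = ((mask >> i) & 1) ? computed[i] : passthru[i];
  }
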
diff --git a/lib/Headers/avx512dqintrin.h b/lib/Headers/avx512dqintrin.h
index 13665e4c6668..ae44b98a9495 100644
--- a/lib/Headers/avx512dqintrin.h
+++ b/lib/Headers/avx512dqintrin.h
@@ -37,204 +37,169 @@ _mm512_mullo_epi64 (__m512i __A, __m512i __B) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_mullo_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) {
- return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+_mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_mullo_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_xor_pd (__m512d __A, __m512d __B) {
- return (__m512d) ((__v8du) __A ^ (__v8du) __B);
+_mm512_xor_pd(__m512d __A, __m512d __B) {
+ return (__m512d)((__v8du)__A ^ (__v8du)__B);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U);
+_mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_xor_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
+_mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_xor_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_xor_ps (__m512 __A, __m512 __B) {
- return (__m512) ((__v16su) __A ^ (__v16su) __B);
+ return (__m512)((__v16su)__A ^ (__v16su)__B);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U);
+_mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_xor_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
+_mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_xor_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_or_pd (__m512d __A, __m512d __B) {
- return (__m512d) ((__v8du) __A | (__v8du) __B);
+_mm512_or_pd(__m512d __A, __m512d __B) {
+ return (__m512d)((__v8du)__A | (__v8du)__B);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U);
+_mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_or_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
+_mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_or_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_or_ps (__m512 __A, __m512 __B) {
- return (__m512) ((__v16su) __A | (__v16su) __B);
+_mm512_or_ps(__m512 __A, __m512 __B) {
+ return (__m512)((__v16su)__A | (__v16su)__B);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U);
+_mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_or_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
+_mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_or_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_and_pd (__m512d __A, __m512d __B) {
- return (__m512d) ((__v8du) __A & (__v8du) __B);
+_mm512_and_pd(__m512d __A, __m512d __B) {
+ return (__m512d)((__v8du)__A & (__v8du)__B);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U);
+_mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_and_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
+_mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_and_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_and_ps (__m512 __A, __m512 __B) {
- return (__m512) ((__v16su) __A & (__v16su) __B);
+_mm512_and_ps(__m512 __A, __m512 __B) {
+ return (__m512)((__v16su)__A & (__v16su)__B);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U);
+_mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_and_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
+_mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_and_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_andnot_pd (__m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) -1);
+_mm512_andnot_pd(__m512d __A, __m512d __B) {
+ return (__m512d)(~(__v8du)__A & (__v8du)__B);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U);
+_mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_andnot_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
+_mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_andnot_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_andnot_ps (__m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) -1);
+_mm512_andnot_ps(__m512 __A, __m512 __B) {
+ return (__m512)(~(__v16su)__A & (__v16su)__B);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U);
+_mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_andnot_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
+_mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_andnot_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1151,148 +1116,184 @@ _mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
}
#define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \
- (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1); })
+ (__m256)__builtin_shufflevector((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_undefined_ps(), \
+ ((imm) & 1) ? 8 : 0, \
+ ((imm) & 1) ? 9 : 1, \
+ ((imm) & 1) ? 10 : 2, \
+ ((imm) & 1) ? 11 : 3, \
+ ((imm) & 1) ? 12 : 4, \
+ ((imm) & 1) ? 13 : 5, \
+ ((imm) & 1) ? 14 : 6, \
+ ((imm) & 1) ? 15 : 7); })
#define _mm512_mask_extractf32x8_ps(W, U, A, imm) __extension__ ({ \
- (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v8sf)(__m256)(W), \
- (__mmask8)(U)); })
+ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
+ (__v8sf)(W)); })
#define _mm512_maskz_extractf32x8_ps(U, A, imm) __extension__ ({ \
- (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U)); })
+ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
+ (__v8sf)_mm256_setzero_ps()); })
#define _mm512_extractf64x2_pd(A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1); })
+ (__m128d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_undefined_pd(), \
+ 0 + ((imm) & 0x3) * 2, \
+ 1 + ((imm) & 0x3) * 2); })
#define _mm512_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
- (int)(imm), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U)); })
+ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
+ (__v2df)(W)); })
#define _mm512_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
+ (__v2df)_mm_setzero_pd()); })
#define _mm512_extracti32x8_epi32(A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)-1); })
+ (__m256i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+ (__v16si)_mm512_undefined_epi32(), \
+ ((imm) & 1) ? 8 : 0, \
+ ((imm) & 1) ? 9 : 1, \
+ ((imm) & 1) ? 10 : 2, \
+ ((imm) & 1) ? 11 : 3, \
+ ((imm) & 1) ? 12 : 4, \
+ ((imm) & 1) ? 13 : 5, \
+ ((imm) & 1) ? 14 : 6, \
+ ((imm) & 1) ? 15 : 7); })
#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
+ (__v8si)(W)); })
#define _mm512_maskz_extracti32x8_epi32(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
+ (__v8si)_mm256_setzero_si256()); })
#define _mm512_extracti64x2_epi64(A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
- (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)-1); })
+ (__m128i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+ (__v8di)_mm512_undefined_epi32(), \
+ 0 + ((imm) & 0x3) * 2, \
+ 1 + ((imm) & 0x3) * 2); })
#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
- (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
+ (__v2di)(W)); })
#define _mm512_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
- (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
+ (__v2di)_mm_setzero_di()); })
#define _mm512_insertf32x8(A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
- (__v8sf)(__m256)(B), (int)(imm), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1); })
+ (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_castps256_ps512((__m256)(B)),\
+ ((imm) & 0x1) ? 0 : 16, \
+ ((imm) & 0x1) ? 1 : 17, \
+ ((imm) & 0x1) ? 2 : 18, \
+ ((imm) & 0x1) ? 3 : 19, \
+ ((imm) & 0x1) ? 4 : 20, \
+ ((imm) & 0x1) ? 5 : 21, \
+ ((imm) & 0x1) ? 6 : 22, \
+ ((imm) & 0x1) ? 7 : 23, \
+ ((imm) & 0x1) ? 16 : 8, \
+ ((imm) & 0x1) ? 17 : 9, \
+ ((imm) & 0x1) ? 18 : 10, \
+ ((imm) & 0x1) ? 19 : 11, \
+ ((imm) & 0x1) ? 20 : 12, \
+ ((imm) & 0x1) ? 21 : 13, \
+ ((imm) & 0x1) ? 22 : 14, \
+ ((imm) & 0x1) ? 23 : 15); })
#define _mm512_mask_insertf32x8(W, U, A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
- (__v8sf)(__m256)(B), (int)(imm), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U)); })
+ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+ (__v16sf)(W)); })
#define _mm512_maskz_insertf32x8(U, A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x8_mask((__v16sf)(__m512)(A), \
- (__v8sf)(__m256)(B), (int)(imm), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U)); })
+ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+ (__v16sf)_mm512_setzero_ps()); })
#define _mm512_insertf64x2(A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1); })
+ (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_castpd128_pd512((__m128d)(B)),\
+ (((imm) & 0x3) == 0) ? 8 : 0, \
+ (((imm) & 0x3) == 0) ? 9 : 1, \
+ (((imm) & 0x3) == 1) ? 8 : 2, \
+ (((imm) & 0x3) == 1) ? 9 : 3, \
+ (((imm) & 0x3) == 2) ? 8 : 4, \
+ (((imm) & 0x3) == 2) ? 9 : 5, \
+ (((imm) & 0x3) == 3) ? 8 : 6, \
+ (((imm) & 0x3) == 3) ? 9 : 7); })
#define _mm512_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U)); })
+ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+ (__v8df)(W)); })
#define _mm512_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x2_512_mask((__v8df)(__m512d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+ (__v8df)_mm512_setzero_pd()); })
#define _mm512_inserti32x8(A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+ (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+ (__v16si)_mm512_castsi256_si512((__m256i)(B)),\
+ ((imm) & 0x1) ? 0 : 16, \
+ ((imm) & 0x1) ? 1 : 17, \
+ ((imm) & 0x1) ? 2 : 18, \
+ ((imm) & 0x1) ? 3 : 19, \
+ ((imm) & 0x1) ? 4 : 20, \
+ ((imm) & 0x1) ? 5 : 21, \
+ ((imm) & 0x1) ? 6 : 22, \
+ ((imm) & 0x1) ? 7 : 23, \
+ ((imm) & 0x1) ? 16 : 8, \
+ ((imm) & 0x1) ? 17 : 9, \
+ ((imm) & 0x1) ? 18 : 10, \
+ ((imm) & 0x1) ? 19 : 11, \
+ ((imm) & 0x1) ? 20 : 12, \
+ ((imm) & 0x1) ? 21 : 13, \
+ ((imm) & 0x1) ? 22 : 14, \
+ ((imm) & 0x1) ? 23 : 15); })
#define _mm512_mask_inserti32x8(W, U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+ (__v16si)(W)); })
#define _mm512_maskz_inserti32x8(U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x8_mask((__v16si)(__m512i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()); })
#define _mm512_inserti64x2(A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+ (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+ (__v8di)_mm512_castsi128_si512((__m128i)(B)),\
+ (((imm) & 0x3) == 0) ? 8 : 0, \
+ (((imm) & 0x3) == 0) ? 9 : 1, \
+ (((imm) & 0x3) == 1) ? 8 : 2, \
+ (((imm) & 0x3) == 1) ? 9 : 3, \
+ (((imm) & 0x3) == 2) ? 8 : 4, \
+ (((imm) & 0x3) == 2) ? 9 : 5, \
+ (((imm) & 0x3) == 3) ? 8 : 6, \
+ (((imm) & 0x3) == 3) ? 9 : 7); })
#define _mm512_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+ (__v8di)(W)); })
#define _mm512_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x2_512_mask((__v8di)(__m512i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()); })
#define _mm512_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
(__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
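 
The extract/insert macros above now expand to __builtin_shufflevector index lists instead of the masked builtins, so the 128-bit lane selected by imm can be read straight off the indices. A small example, assuming an AVX512DQ-enabled target (the third_lane name is illustrative only):

  /* imm = 2 selects shuffle indices 0 + (2 & 3) * 2 = 4 and 5, i.e. the
     third 128-bit lane (doubles 4 and 5) of the 512-bit source. */
  __m128d third_lane(__m512d v) {
    return _mm512_extractf64x2_pd(v, 2);
  }
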
diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h
index 0bf6582345d4..e6a7217c8967 100644
--- a/lib/Headers/avx512fintrin.h
+++ b/lib/Headers/avx512fintrin.h
@@ -54,6 +54,19 @@ typedef unsigned short __mmask16;
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
+/* Constants for integer comparison predicates */
+typedef enum {
+ _MM_CMPINT_EQ, /* Equal */
+ _MM_CMPINT_LT, /* Less than */
+ _MM_CMPINT_LE, /* Less than or Equal */
+ _MM_CMPINT_UNUSED,
+ _MM_CMPINT_NE, /* Not Equal */
+ _MM_CMPINT_NLT, /* Not Less than */
+#define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */
+ _MM_CMPINT_NLE /* Not Less than or Equal */
+#define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */
+} _MM_CMPINT_ENUM;
+
typedef enum
{
_MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
@@ -503,6 +516,18 @@ _mm512_castsi512_si256 (__m512i __A)
return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
}
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_int2mask(int __a)
+{
+ return (__mmask16)__a;
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask2int(__mmask16 __a)
+{
+ return (int)__a;
+}
+
/* Bitwise operators */
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_and_epi32(__m512i __a, __m512i __b)
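 
The new _MM_CMPINT_* enumerators name the predicate argument of the AVX-512 integer compare intrinsics, and _mm512_int2mask/_mm512_mask2int convert between plain int and __mmask16. A usage sketch, assuming the _mm512_cmp_epi32_mask compare intrinsic defined elsewhere in this header (not shown in this hunk) and the GCC/Clang __builtin_popcount builtin:

  /* Count how many of the 16 32-bit lanes of a are less than the
     corresponding lanes of b. */
  int count_less(__m512i a, __m512i b) {
    __mmask16 m = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
    return __builtin_popcount(_mm512_mask2int(m));
  }
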
@@ -737,22 +762,19 @@ _mm512_add_epi64 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_add_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_add_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -762,22 +784,19 @@ _mm512_sub_epi64 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sub_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sub_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -787,22 +806,19 @@ _mm512_add_epi32 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_add_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_add_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -812,22 +828,19 @@ _mm512_sub_epi32 (__m512i __A, __m512i __B)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sub_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sub_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
#define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \
@@ -1403,57 +1416,45 @@ _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_mul_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di) __W, __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epi32(__X, __Y),
+ (__v8di)__W);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epi32(__X, __Y),
+ (__v8di)_mm512_setzero_si512 ());
}
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_mul_epu32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di) __W, __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epu32(__X, __Y),
+ (__v8di)__W);
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
+_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epu32(__X, __Y),
+ (__v8di)_mm512_setzero_si512 ());
}
static __inline __m512i __DEFAULT_FN_ATTRS
@@ -1463,21 +1464,19 @@ _mm512_mullo_epi32 (__m512i __A, __m512i __B)
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_mullo_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si) __W, __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_mullo_epi32(__A, __B),
+ (__v16si)__W);
}
#define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \
@@ -1977,38 +1976,30 @@ _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_add_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_add_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_add_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_add_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_add_round_pd(A, B, R) __extension__ ({ \
@@ -2120,40 +2111,30 @@ _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_sub_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_sub_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_sub_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_sub_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_sub_round_pd(A, B, R) __extension__ ({ \
@@ -2265,40 +2246,30 @@ _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_mul_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_mul_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_mul_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_mul_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_mul_round_pd(A, B, R) __extension__ ({ \
@@ -2417,21 +2388,16 @@ _mm512_div_pd(__m512d __a, __m512d __b)
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_div_pd(__A, __B),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_div_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
}
static __inline __m512 __DEFAULT_FN_ATTRS
@@ -2442,21 +2408,16 @@ _mm512_div_ps(__m512 __a, __m512 __b)
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_div_ps(__A, __B),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_div_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
}
#define _mm512_div_round_pd(A, B, R) __extension__ ({ \
@@ -3443,71 +3404,94 @@ _mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
}
#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
- (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(I), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+ (__m512i)__builtin_shufflevector((__v8di)(__m512i)(B), \
+ (__v8di)(__m512i)(A), \
+ ((int)(I) & 0x7) + 0, \
+ ((int)(I) & 0x7) + 1, \
+ ((int)(I) & 0x7) + 2, \
+ ((int)(I) & 0x7) + 3, \
+ ((int)(I) & 0x7) + 4, \
+ ((int)(I) & 0x7) + 5, \
+ ((int)(I) & 0x7) + 6, \
+ ((int)(I) & 0x7) + 7); })
#define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__({\
- (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(imm), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+ (__v8di)(__m512i)(W)); })
#define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__({\
- (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()); })
#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
- (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(I), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+ (__m512i)__builtin_shufflevector((__v16si)(__m512i)(B), \
+ (__v16si)(__m512i)(A), \
+ ((int)(I) & 0xf) + 0, \
+ ((int)(I) & 0xf) + 1, \
+ ((int)(I) & 0xf) + 2, \
+ ((int)(I) & 0xf) + 3, \
+ ((int)(I) & 0xf) + 4, \
+ ((int)(I) & 0xf) + 5, \
+ ((int)(I) & 0xf) + 6, \
+ ((int)(I) & 0xf) + 7, \
+ ((int)(I) & 0xf) + 8, \
+ ((int)(I) & 0xf) + 9, \
+ ((int)(I) & 0xf) + 10, \
+ ((int)(I) & 0xf) + 11, \
+ ((int)(I) & 0xf) + 12, \
+ ((int)(I) & 0xf) + 13, \
+ ((int)(I) & 0xf) + 14, \
+ ((int)(I) & 0xf) + 15); })
#define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({\
- (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(imm), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+ (__v16si)(__m512i)(W)); })
#define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__({\
- (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()); })
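Editorial note, not part of the patch: the rewritten alignr macros express VALIGND/VALIGNQ as a __builtin_shufflevector over the concatenation of B (low elements) and A (high elements), with the result starting (I mod N) elements into that sequence. A scalar C sketch of what the 32-bit variant computes (names are ours):

/* Scalar model of _mm512_alignr_epi32(A, B, imm) as rewritten above:
 * treat B (low) and A (high) as one 32-element sequence and read 16
 * consecutive elements starting at (imm & 0xf). */
#include <stdint.h>

static void alignr_epi32_model(int32_t dst[16], const int32_t a[16],
                               const int32_t b[16], int imm)
{
    int32_t concat[32];
    for (int i = 0; i < 16; ++i) {
        concat[i]      = b[i];   /* B supplies shuffle indices 0..15  */
        concat[i + 16] = a[i];   /* A supplies shuffle indices 16..31 */
    }
    int shift = imm & 0xf;       /* same masking as in the macro */
    for (int j = 0; j < 16; ++j)
        dst[j] = concat[shift + j];
}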
/* Vector Extract */
-#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \
- (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
- (__v4df)_mm256_setzero_si256(), \
- (__mmask8)-1); })
+#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \
+ (__m256d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_undefined_pd(), \
+ ((I) & 1) ? 4 : 0, \
+ ((I) & 1) ? 5 : 1, \
+ ((I) & 1) ? 6 : 2, \
+ ((I) & 1) ? 7 : 3); })
#define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({\
- (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
- (__v4df)(__m256d)(W), \
- (__mmask8)(U)); })
+ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
+ (__v4df)(W)); })
#define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({\
- (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
+ (__v4df)_mm256_setzero_pd()); })
-#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \
- (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1); })
+#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \
+ (__m128)__builtin_shufflevector((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_undefined_ps(), \
+ 0 + ((I) & 0x3) * 4, \
+ 1 + ((I) & 0x3) * 4, \
+ 2 + ((I) & 0x3) * 4, \
+ 3 + ((I) & 0x3) * 4); })
#define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({\
- (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U)); })
+ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
+ (__v4sf)(W)); })
#define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({\
- (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U)); })
+ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
+ (__v4sf)_mm_setzero_ps()); })
+
/* Vector Blend */
static __inline __m512d __DEFAULT_FN_ATTRS
@@ -3556,10 +3540,49 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
#define _mm512_cmp_ps_mask(A, B, P) \
_mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
_mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_cmpeq_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
+
#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
(__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(P), \
@@ -3572,10 +3595,49 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
#define _mm512_cmp_pd_mask(A, B, P) \
_mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
_mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_cmpeq_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
+
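Editorial note, not part of the patch: the new cmpeq/cmplt/.../cmpord shorthands simply fix the predicate argument of the generic compare, so a shorthand call and the spelled-out _mm512_cmp_*_mask call are expected to produce the same mask. A hedged usage sketch (the function name is ours; assumes an AVX-512F target, e.g. clang -mavx512f):

#include <immintrin.h>

/* Count how many of the 8 double lanes of x are below limit. */
static int count_less_than(__m512d x, __m512d limit)
{
    __mmask8 k1 = _mm512_cmplt_pd_mask(x, limit);            /* new shorthand     */
    __mmask8 k2 = _mm512_cmp_pd_mask(x, limit, _CMP_LT_OS);  /* spelled-out form  */
    (void)k2;                                                /* same mask as k1   */
    return __builtin_popcount((unsigned)k1);
}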
/* Conversion */
#define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \
@@ -3682,26 +3744,35 @@ _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
static __inline __m512d __DEFAULT_FN_ATTRS
_mm512_cvtepi32_pd(__m256i __A)
{
- return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) -1);
+ return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
- (__v8df) __W,
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepi32_pd(__A),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
- (__v8df) _mm512_setzero_pd (),
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepi32_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi32lo_pd(__m512i __A)
+{
+ return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
+{
+ return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -3734,26 +3805,35 @@ _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
static __inline __m512d __DEFAULT_FN_ATTRS
_mm512_cvtepu32_pd(__m256i __A)
{
- return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) -1);
+ return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
- (__v8df) __W,
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepu32_pd(__A),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
{
- return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
- (__v8df) _mm512_setzero_pd (),
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepu32_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu32lo_pd(__m512i __A)
+{
+ return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
+{
+ return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
}
#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
@@ -3798,6 +3878,24 @@ _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
_MM_FROUND_CUR_DIRECTION);
}
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtpd_pslo (__m512d __A)
+{
+ return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
+ (__v8sf) _mm256_setzero_ps (),
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
+{
+ return (__m512) __builtin_shufflevector (
+ (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
+ __U, __A),
+ (__v8sf) _mm256_setzero_ps (),
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
#define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \
(__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
(__v16hi)_mm256_undefined_si256(), \
@@ -4919,263 +5017,227 @@ _mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi8_epi32 (__m128i __A)
+_mm512_cvtepi8_epi32(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi8_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A)
+_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi8_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
}
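Editorial note, not part of the patch: the __v16qs cast in _mm512_cvtepi8_epi32 above matters because the signedness of plain char, and therefore of __v16qi, is target-dependent, while the sign-extending pmovsx semantics require a signed source. A small standalone C example of the difference when widening the byte 0x80:

#include <stdio.h>

int main(void)
{
    unsigned char raw = 0x80;
    signed char   s   = (signed char)raw;  /* -128 on the usual two's-complement targets */
    char          c   = (char)raw;         /* signedness is implementation-defined       */

    printf("signed char -> int: %d\n", (int)s);  /* -128: sign extension            */
    printf("plain  char -> int: %d\n", (int)c);  /* -128 or 128, depending on target */
    return 0;
}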
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi8_epi64 (__m128i __A)
+_mm512_cvtepi8_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi8_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi8_epi64(__A),
+ (__v8di)_mm512_setzero_si512 ());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi32_epi64 (__m256i __X)
+_mm512_cvtepi32_epi64(__m256i __X)
{
- return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi32_epi64(__X),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X)
+_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi32_epi64(__X),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi16_epi32 (__m256i __A)
+_mm512_cvtepi16_epi32(__m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi16_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A)
+_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi16_epi32(__A),
+ (__v16si)_mm512_setzero_si512 ());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepi16_epi64 (__m128i __A)
+_mm512_cvtepi16_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi16_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi16_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu8_epi32 (__m128i __A)
+_mm512_cvtepu8_epi32(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu8_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A)
+_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu8_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu8_epi64 (__m128i __A)
+_mm512_cvtepu8_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu8_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu8_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu32_epi64 (__m256i __X)
+_mm512_cvtepu32_epi64(__m256i __X)
{
- return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu32_epi64(__X),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X)
+_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
{
- return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu32_epi64(__X),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu16_epi32 (__m256i __A)
+_mm512_cvtepu16_epi32(__m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu16_epi32(__A),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A)
+_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu16_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_cvtepu16_epi64 (__m128i __A)
+_mm512_cvtepu16_epi64(__m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu16_epi64(__A),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu16_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -5393,67 +5455,91 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U)); })
-#define _mm512_slli_epi32(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
-
-#define _mm512_mask_slli_epi32(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
-
-#define _mm512_maskz_slli_epi32(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_pslldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi32(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
+}
-#define _mm512_slli_epi64(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_slli_epi32(__A, __B),
+ (__v16si)__W);
+}
-#define _mm512_mask_slli_epi64(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B) {
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_slli_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
-#define _mm512_maskz_slli_epi64(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psllqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi64(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_slli_epi64(__A, __B),
+ (__v8di)__W);
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_slli_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
-#define _mm512_srli_epi32(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi32(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
+}
-#define _mm512_mask_srli_epi32(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srli_epi32(__A, __B),
+ (__v16si)__W);
+}
-#define _mm512_maskz_srli_epi32(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrldi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B) {
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srli_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
-#define _mm512_srli_epi64(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi64(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
+}
-#define _mm512_mask_srli_epi64(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srli_epi64(__A, __B),
+ (__v8di)__W);
+}
-#define _mm512_maskz_srli_epi64(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srli_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
@@ -5911,8 +5997,10 @@ _mm512_kmov (__mmask16 __A)
(int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
(int)(P), (int)(R)); })
+#ifdef __x86_64__
#define _mm_cvt_roundsd_si64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
@@ -5926,351 +6014,267 @@ _mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sll_epi32 (__m512i __A, __m128i __B)
+_mm512_sll_epi32(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sll_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sll_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sll_epi64 (__m512i __A, __m128i __B)
+_mm512_sll_epi64(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sll_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sll_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sllv_epi32 (__m512i __X, __m512i __Y)
+_mm512_sllv_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sllv_epi32(__X, __Y),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sllv_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sllv_epi64 (__m512i __X, __m512i __Y)
+_mm512_sllv_epi64(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_undefined_pd (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sllv_epi64(__X, __Y),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sllv_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sra_epi32 (__m512i __A, __m128i __B)
+_mm512_sra_epi32(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sra_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sra_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_sra_epi64 (__m512i __A, __m128i __B)
+_mm512_sra_epi64(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sra_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sra_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srav_epi32 (__m512i __X, __m512i __Y)
+_mm512_srav_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srav_epi32(__X, __Y),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srav_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srav_epi64 (__m512i __X, __m512i __Y)
+_mm512_srav_epi64(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srav_epi64(__X, __Y),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srav_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srl_epi32 (__m512i __A, __m128i __B)
+_mm512_srl_epi32(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srl_epi32(__A, __B),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
- (__v4si) __B,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srl_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srl_epi64 (__m512i __A, __m128i __B)
+_mm512_srl_epi64(__m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srl_epi64(__A, __B),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
- return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
- (__v2di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srl_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_srlv_epi32 (__m512i __X, __m512i __Y)
+_mm512_srlv_epi32(__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si) __W,
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srlv_epi32(__X, __Y),
+ (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
- (__v16si) __Y,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srlv_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di) __W,
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srlv_epi64(__X, __Y),
+ (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
- return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
- (__v8di) __Y,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srlv_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
}
#define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
@@ -6309,8 +6313,10 @@ _mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
(__v8di)(__m512i)(C), (int)(imm), \
(__mmask8)(U)); })
+#ifdef __x86_64__
#define _mm_cvt_roundsd_i64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
#define _mm_cvt_roundsd_si32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })
@@ -6328,6 +6334,7 @@ _mm_cvtsd_u32 (__m128d __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvt_roundsd_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
(int)(R)); })
@@ -6339,6 +6346,7 @@ _mm_cvtsd_u64 (__m128d __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvt_roundss_si32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6346,11 +6354,13 @@ _mm_cvtsd_u64 (__m128d __A)
#define _mm_cvt_roundss_i32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
+#ifdef __x86_64__
#define _mm_cvt_roundss_si64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
#define _mm_cvt_roundss_i64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
+#endif
#define _mm_cvt_roundss_u32(A, R) __extension__ ({ \
(unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6362,6 +6372,7 @@ _mm_cvtss_u32 (__m128 __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvt_roundss_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
(int)(R)); })
@@ -6373,6 +6384,7 @@ _mm_cvtss_u64 (__m128 __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })
@@ -6387,6 +6399,7 @@ _mm_cvttsd_i32 (__m128d __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })
@@ -6399,6 +6412,7 @@ _mm_cvttsd_i64 (__m128d __A)
return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ \
(unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); })
@@ -6410,6 +6424,7 @@ _mm_cvttsd_u32 (__m128d __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
(int)(R)); })
@@ -6421,6 +6436,7 @@ _mm_cvttsd_u64 (__m128d __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundss_i32(A, R) __extension__ ({ \
(int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6435,6 +6451,7 @@ _mm_cvttss_i32 (__m128 __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundss_i64(A, R) __extension__ ({ \
(long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })
@@ -6447,6 +6464,7 @@ _mm_cvttss_i64 (__m128 __A)
return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvtt_roundss_u32(A, R) __extension__ ({ \
(unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); })
@@ -6458,6 +6476,7 @@ _mm_cvttss_u32 (__m128 __A)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvtt_roundss_u64(A, R) __extension__ ({ \
(unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
(int)(R)); })
@@ -6469,6 +6488,7 @@ _mm_cvttss_u64 (__m128 __A)
__A,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
@@ -6556,61 +6576,47 @@ _mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
(__v16sf)_mm512_setzero_ps()); })
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_permutevar_pd (__m512d __A, __m512i __C)
+_mm512_permutevar_pd(__m512d __A, __m512i __C)
{
- return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
- (__v8di) __C,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1);
+ return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
{
- return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
- (__v8di) __C,
- (__v8df) __W,
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_permutevar_pd(__A, __C),
+ (__v8df)__W);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C)
+_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
{
- return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
- (__v8di) __C,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_permutevar_pd(__A, __C),
+ (__v8df)_mm512_setzero_pd());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_permutevar_ps (__m512 __A, __m512i __C)
+_mm512_permutevar_ps(__m512 __A, __m512i __C)
{
- return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
- (__v16si) __C,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1);
+ return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
{
- return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
- (__v16si) __C,
- (__v16sf) __W,
- (__mmask16) __U);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_permutevar_ps(__A, __C),
+ (__v16sf)__W);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C)
+_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
{
- return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
- (__v16si) __C,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_permutevar_ps(__A, __C),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline __m512d __DEFAULT_FN_ATTRS
@@ -7028,35 +7034,48 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
(__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION); })
-#define _mm512_srai_epi32(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi32(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
+}
-#define _mm512_mask_srai_epi32(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \
+ (__v16si)_mm512_srai_epi32(__A, __B), \
+ (__v16si)__W);
+}
-#define _mm512_maskz_srai_epi32(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psradi512_mask((__v16si)(__m512i)(A), (int)(B), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) {
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \
+ (__v16si)_mm512_srai_epi32(__A, __B), \
+ (__v16si)_mm512_setzero_si512());
+}
-#define _mm512_srai_epi64(A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi64(__m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
+}
-#define _mm512_mask_srai_epi64(W, U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \
+ (__v8di)_mm512_srai_epi64(__A, __B), \
+ (__v8di)__W);
+}
-#define _mm512_maskz_srai_epi64(U, A, B) __extension__ ({ \
- (__m512i)__builtin_ia32_psraqi512_mask((__v8di)(__m512i)(A), (int)(B), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \
+ (__v8di)_mm512_srai_epi64(__A, __B), \
+ (__v8di)_mm512_setzero_si512());
+}
#define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \
(__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
@@ -7832,107 +7851,145 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
__builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
}
-#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)_mm_undefined_si128(), \
- (__mmask8)-1); })
+#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \
+ (__m128i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+ (__v16si)_mm512_undefined_epi32(), \
+ 0 + ((imm) & 0x3) * 4, \
+ 1 + ((imm) & 0x3) * 4, \
+ 2 + ((imm) & 0x3) * 4, \
+ 3 + ((imm) & 0x3) * 4); })
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+      (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
+                                (__v4si)(W)); })
#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+      (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
+ (__v4si)_mm_setzero_si128()); })
-#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)_mm256_undefined_si256(), \
- (__mmask8)-1); })
+#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \
+ (__m256i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+ (__v8di)_mm512_undefined_epi32(), \
+ ((imm) & 1) ? 4 : 0, \
+ ((imm) & 1) ? 5 : 1, \
+ ((imm) & 1) ? 6 : 2, \
+ ((imm) & 1) ? 7 : 3); })
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+      (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
+                                (__v4di)(W)); })
#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+      (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
+ (__v4di)_mm256_setzero_si256()); })
#define _mm512_insertf64x4(A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
- (__v4df)(__m256d)(B), (int)(imm), \
- (__v8df)_mm512_undefined_pd(), \
- (__mmask8)-1); })
+ (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_castpd256_pd512((__m256d)(B)), \
+ ((imm) & 0x1) ? 0 : 8, \
+ ((imm) & 0x1) ? 1 : 9, \
+ ((imm) & 0x1) ? 2 : 10, \
+ ((imm) & 0x1) ? 3 : 11, \
+ ((imm) & 0x1) ? 8 : 4, \
+ ((imm) & 0x1) ? 9 : 5, \
+ ((imm) & 0x1) ? 10 : 6, \
+ ((imm) & 0x1) ? 11 : 7); })
#define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
- (__v4df)(__m256d)(B), (int)(imm), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U)); })
+ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+ (__v8df)(W)); })
#define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \
- (__m512d)__builtin_ia32_insertf64x4_mask((__v8df)(__m512d)(A), \
- (__v4df)(__m256d)(B), (int)(imm), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+ (__v8df)_mm512_setzero_pd()); })
#define _mm512_inserti64x4(A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1); })
+ (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+ (__v8di)_mm512_castsi256_si512((__m256i)(B)), \
+ ((imm) & 0x1) ? 0 : 8, \
+ ((imm) & 0x1) ? 1 : 9, \
+ ((imm) & 0x1) ? 2 : 10, \
+ ((imm) & 0x1) ? 3 : 11, \
+ ((imm) & 0x1) ? 8 : 4, \
+ ((imm) & 0x1) ? 9 : 5, \
+ ((imm) & 0x1) ? 10 : 6, \
+ ((imm) & 0x1) ? 11 : 7); })
#define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+ (__v8di)(W)); })
#define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti64x4_mask((__v8di)(__m512i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U)); })
+ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()); })
#define _mm512_insertf32x4(A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v16sf)_mm512_undefined_ps(), \
- (__mmask16)-1); })
+ (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_castps128_ps512((__m128)(B)),\
+ (((imm) & 0x3) == 0) ? 16 : 0, \
+ (((imm) & 0x3) == 0) ? 17 : 1, \
+ (((imm) & 0x3) == 0) ? 18 : 2, \
+ (((imm) & 0x3) == 0) ? 19 : 3, \
+ (((imm) & 0x3) == 1) ? 16 : 4, \
+ (((imm) & 0x3) == 1) ? 17 : 5, \
+ (((imm) & 0x3) == 1) ? 18 : 6, \
+ (((imm) & 0x3) == 1) ? 19 : 7, \
+ (((imm) & 0x3) == 2) ? 16 : 8, \
+ (((imm) & 0x3) == 2) ? 17 : 9, \
+ (((imm) & 0x3) == 2) ? 18 : 10, \
+ (((imm) & 0x3) == 2) ? 19 : 11, \
+ (((imm) & 0x3) == 3) ? 16 : 12, \
+ (((imm) & 0x3) == 3) ? 17 : 13, \
+ (((imm) & 0x3) == 3) ? 18 : 14, \
+ (((imm) & 0x3) == 3) ? 19 : 15); })
#define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U)); })
+ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+ (__v16sf)(W)); })
#define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
- (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)(__m512)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U)); })
+ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+ (__v16sf)_mm512_setzero_ps()); })
#define _mm512_inserti32x4(A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1); })
+ (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+ (__v16si)_mm512_castsi128_si512((__m128i)(B)),\
+ (((imm) & 0x3) == 0) ? 16 : 0, \
+ (((imm) & 0x3) == 0) ? 17 : 1, \
+ (((imm) & 0x3) == 0) ? 18 : 2, \
+ (((imm) & 0x3) == 0) ? 19 : 3, \
+ (((imm) & 0x3) == 1) ? 16 : 4, \
+ (((imm) & 0x3) == 1) ? 17 : 5, \
+ (((imm) & 0x3) == 1) ? 18 : 6, \
+ (((imm) & 0x3) == 1) ? 19 : 7, \
+ (((imm) & 0x3) == 2) ? 16 : 8, \
+ (((imm) & 0x3) == 2) ? 17 : 9, \
+ (((imm) & 0x3) == 2) ? 18 : 10, \
+ (((imm) & 0x3) == 2) ? 19 : 11, \
+ (((imm) & 0x3) == 3) ? 16 : 12, \
+ (((imm) & 0x3) == 3) ? 17 : 13, \
+ (((imm) & 0x3) == 3) ? 18 : 14, \
+ (((imm) & 0x3) == 3) ? 19 : 15); })
#define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+ (__v16si)(W)); })
#define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
- (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)(__m512i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U)); })
+ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()); })
#define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \
(__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
@@ -8275,17 +8332,17 @@ __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ (__v4sf) __A,
(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\
- (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ (__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8323,17 +8380,17 @@ _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ (__v4sf) __A,
-(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ (__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8355,33 +8412,33 @@ _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+ return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
(__v4sf) __X,
- -(__v4sf) __Y,
+ (__v4sf) __Y,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
+ (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
(__v4sf)(__m128)(X), \
- -(__v4sf)(__m128)(Y), (__mmask8)(U), \
+ (__v4sf)(__m128)(Y), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ -(__v4sf) __A,
(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ -(__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8419,17 +8476,17 @@ _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask (-(__v4sf) __A,
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ -(__v4sf) __A,
-(__v4sf) __B,
- (__v4sf) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\
- (__m128)__builtin_ia32_vfmaddss3_mask(-(__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ -(__v4sf)(__m128)(A), \
+ -(__v4sf)(__m128)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -8451,33 +8508,33 @@ _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
- return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W,
+ return (__m128) __builtin_ia32_vfnmsubss3_mask3 ((__v4sf) __W,
(__v4sf) __X,
- -(__v4sf) __Y,
+ (__v4sf) __Y,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\
- (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
+ (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \
(__v4sf)(__m128)(X), \
- -(__v4sf)(__m128)(Y), (__mmask8)(U), \
+ (__v4sf)(__m128)(Y), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ (__v2df) __A,
(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ (__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8515,17 +8572,17 @@ _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ (__v2df) __A,
-(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ (__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8547,33 +8604,33 @@ _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+ return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
(__v2df) __X,
- -(__v2df) __Y,
+ (__v2df) __Y,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+ (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
(__v2df)(__m128d)(X), \
- -(__v2df)(__m128d)(Y), \
+ (__v2df)(__m128d)(Y), \
(__mmask8)(U), (int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ -(__v2df) __A,
(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ -(__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8611,17 +8668,17 @@ _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask ( -(__v2df) __A,
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+ -(__v2df) __A,
-(__v2df) __B,
- (__v2df) __W,
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask(-(__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ -(__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), (__mmask8)(U), \
(int)(R)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -8644,17 +8701,17 @@ _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
- return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) (__W),
+ return (__m128d) __builtin_ia32_vfnmsubsd3_mask3 ((__v2df) (__W),
(__v2df) __X,
- -(__v2df) (__Y),
+ (__v2df) (__Y),
(__mmask8) __U,
_MM_FROUND_CUR_DIRECTION);
}
#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\
- (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
+ (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \
(__v2df)(__m128d)(X), \
- -(__v2df)(__m128d)(Y), \
+ (__v2df)(__m128d)(Y), \
(__mmask8)(U), (int)(R)); })
#define _mm512_permutex_pd(X, C) __extension__ ({ \
@@ -9041,6 +9098,101 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
(__v16sf)_mm512_setzero_ps());
}
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ __m128 res = __A;
+ res[0] = (__U & 1) ? __B[0] : __W[0];
+ return res;
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ __m128 res = __A;
+ res[0] = (__U & 1) ? __B[0] : 0;
+ return res;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ __m128d res = __A;
+ res[0] = (__U & 1) ? __B[0] : __W[0];
+ return res;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ __m128d res = __A;
+ res[0] = (__U & 1) ? __B[0] : 0;
+ return res;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
+{
+ __builtin_ia32_storess128_mask ((__v16sf *)__W,
+ (__v16sf) _mm512_castps128_ps512(__A),
+ (__mmask16) __U & (__mmask16)1);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
+{
+ __builtin_ia32_storesd128_mask ((__v8df *)__W,
+ (__v8df) _mm512_castpd128_pd512(__A),
+ (__mmask8) __U & 1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
+{
+ __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
+ (__v4sf) {0.0, 0.0, 0.0, 0.0},
+ 0, 4, 4, 4);
+
+ return (__m128) __builtin_shufflevector(
+ __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+ (__v16sf) _mm512_castps128_ps512(src),
+ (__mmask16) __U & 1),
+ _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_load_ss (__mmask8 __U, const float* __A)
+{
+ return (__m128) __builtin_shufflevector(
+ __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+ (__v16sf) _mm512_setzero_ps(),
+ (__mmask16) __U & 1),
+ _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
+{
+ __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
+ (__v2df) {0.0, 0.0}, 0, 2);
+
+ return (__m128d) __builtin_shufflevector(
+ __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+ (__v8df) _mm512_castpd128_pd512(src),
+ (__mmask8) __U & 1),
+ _mm512_undefined_pd(), 0, 1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_load_sd (__mmask8 __U, const double* __A)
+{
+ return (__m128d) __builtin_shufflevector(
+ __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+ (__v8df) _mm512_setzero_pd(),
+ (__mmask8) __U & 1),
+ _mm512_undefined_pd(), 0, 1);
+}
+
#define _mm512_shuffle_epi32(A, I) __extension__ ({ \
(__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
(__v16si)_mm512_undefined_epi32(), \
@@ -9243,6 +9395,18 @@ _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
_MM_FROUND_CUR_DIRECTION);
}
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtpslo_pd (__m512 __A)
+{
+  return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
+{
+  return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
+}
+
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
@@ -9340,14 +9504,17 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
}
#define _mm_cvtss_i32 _mm_cvtss_si32
-#define _mm_cvtss_i64 _mm_cvtss_si64
#define _mm_cvtsd_i32 _mm_cvtsd_si32
-#define _mm_cvtsd_i64 _mm_cvtsd_si64
#define _mm_cvti32_sd _mm_cvtsi32_sd
-#define _mm_cvti64_sd _mm_cvtsi64_sd
#define _mm_cvti32_ss _mm_cvtsi32_ss
+#ifdef __x86_64__
+#define _mm_cvtss_i64 _mm_cvtss_si64
+#define _mm_cvtsd_i64 _mm_cvtsd_si64
+#define _mm_cvti64_sd _mm_cvtsi64_sd
#define _mm_cvti64_ss _mm_cvtsi64_ss
+#endif
+#ifdef __x86_64__
#define _mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
(int)(R)); })
@@ -9355,6 +9522,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
#define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
(int)(R)); })
+#endif
#define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
@@ -9362,6 +9530,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
#define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
+#ifdef __x86_64__
#define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
(int)(R)); })
@@ -9369,6 +9538,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
#define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
(int)(R)); })
+#endif
#define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
@@ -9412,6 +9582,7 @@ _mm_cvtu32_sd (__m128d __A, unsigned __B)
return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
}
+#ifdef __x86_64__
#define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
(unsigned long long)(B), (int)(R)); })
@@ -9422,6 +9593,7 @@ _mm_cvtu64_sd (__m128d __A, unsigned long long __B)
return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
#define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
@@ -9434,6 +9606,7 @@ _mm_cvtu32_ss (__m128 __A, unsigned __B)
_MM_FROUND_CUR_DIRECTION);
}
+#ifdef __x86_64__
#define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
(unsigned long long)(B), (int)(R)); })
@@ -9444,6 +9617,7 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B)
return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
_MM_FROUND_CUR_DIRECTION);
}
+#endif
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
@@ -9452,12 +9626,14 @@ _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
__M);
}
+#ifdef __x86_64__
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
{
return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
__M);
}
+#endif
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_set_epi32 (int __A, int __B, int __C, int __D,
@@ -9514,27 +9690,553 @@ _mm512_set_ps (float __A, float __B, float __C, float __D,
(e4),(e3),(e2),(e1),(e0))
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_abs_ps(__m512 A)
+_mm512_abs_ps(__m512 __A)
{
- return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ;
+ return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_abs_ps(__m512 W, __mmask16 K, __m512 A)
+_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
{
- return (__m512)_mm512_mask_and_epi32((__m512i)W, K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)A) ;
+ return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_abs_pd(__m512d A)
+_mm512_abs_pd(__m512d __A)
{
- return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A) ;
+ return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_abs_pd(__m512d W, __mmask8 K, __m512d A)
-{
- return (__m512d)_mm512_mask_and_epi64((__v8di)W, K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)A);
+_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
+{
+ return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
+}
+
+// Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
+// outputs. This class of vector operation forms the basis of many scientific
+// computations. In vector-reduction arithmetic, the evaluation of the reduction
+// is independent of the order of the input elements of V.
+
+// We use the bisection method: at each step the vector from the previous step
+// is split in half, and the operation is performed on the two halves.
+// This takes log2(n) steps, where n is the number of elements in the vector.
+
+// Vec512 - Vector with size 512.
+// Operator - Can be one of the following: +,*,&,|
+// T2 - Can be 'i' for int or 'f' for float.
+// T1 - Can be 'i' for int or 'd' for double.
+
+#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1) \
+ __extension__({ \
+ __m256##T1 Vec256 = __builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, 1, 2, 3) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 4, 5, 6, 7); \
+ __m128##T1 Vec128 = __builtin_shufflevector( \
+ (__v4d##T2)Vec256, \
+ (__v4d##T2)Vec256, \
+ 0, 1) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v4d##T2)Vec256, \
+ (__v4d##T2)Vec256, \
+ 2, 3); \
+ Vec128 = __builtin_shufflevector((__v2d##T2)Vec128, \
+ (__v2d##T2)Vec128, 0, -1) \
+ Operator \
+ __builtin_shufflevector((__v2d##T2)Vec128, \
+ (__v2d##T2)Vec128, 1, -1); \
+ return Vec128[0]; \
+ })
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, +, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, *, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, &, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) {
+ _mm512_reduce_operator_64bit(__W, |, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) {
+ _mm512_reduce_operator_64bit(__W, +, f, d);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
+ _mm512_reduce_operator_64bit(__W, *, f, d);
+}
+
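Editorial aside, not part of the commit: the bisection scheme described above can be hard to follow through the shufflevector plumbing, so here is a minimal scalar sketch of what _mm512_reduce_add_epi64 computes; the fixed element count of 8 and the helper name reduce_add_i64 are illustrative only.

    #include <stdint.h>

    /* Scalar sketch of the log2(n) bisection reduction: fold the upper half
     * of the vector onto the lower half until a single element remains. */
    static int64_t reduce_add_i64(const int64_t v[8]) {
      int64_t tmp[8];
      for (int i = 0; i < 8; ++i)
        tmp[i] = v[i];
      for (int width = 4; width >= 1; width /= 2)  /* 3 steps == log2(8) */
        for (int i = 0; i < width; ++i)
          tmp[i] += tmp[i + width];
      return tmp[0];
    }

Pairing arbitrary halves like this only gives the expected answer when the operator can be reassociated freely, which is the order-independence assumption stated in the comment above.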
+// Vec512 - Vector with size 512.
+// Vec512Neutral - All vector elements set to the identity element.
+//                 Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0}
+// Operator - Can be one of the following: +,*,&,|
+// Mask - Intrinsic mask
+// T2 - Can be 'i' for int or 'f' for float.
+// T1 - Can be 'i' for int or 'd' for double.
+// T3 - Can be 'q' for q-word or 'pd' for packed double.
+
+#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator, \
+ Mask, T2, T1, T3) \
+ __extension__({ \
+ Vec512 = __builtin_ia32_select##T3##_512( \
+ (__mmask8)Mask, \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512Neutral); \
+ _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1); \
+ })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
+ &, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M,
+ i, i, q);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
+ f, d, pd);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M,
+ f, d, pd);
+}
+
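Editorial aside, not part of the commit: the masked variants first overwrite the masked-off lanes with the operator's identity element and then run the unmasked reduction, so those lanes cannot influence the result. A scalar sketch with an illustrative helper name:

    #include <stdint.h>

    /* Sketch of _mm512_mask_reduce_add_epi64: lanes whose mask bit is clear
     * contribute the identity of '+' (0) and so drop out of the sum. */
    static int64_t mask_reduce_add_i64(uint8_t mask, const int64_t v[8]) {
      int64_t sum = 0;
      for (int i = 0; i < 8; ++i)
        sum += (mask & (1u << i)) ? v[i] : 0;
      return sum;
    }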
+// Vec512 - Vector with size 512.
+// Operator - Can be one of the following: +,*,&,|
+// T2 - Can be 'i' for int or 'f' for float.
+// T1 - Can be 'i' for int or ' ' (blank) for packed single.
+
+#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \
+ __m256##T1 Vec256 = \
+ (__m256##T1)(__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, 2, 3, 4, 5, 6, 7) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 8, 9, 10, 11, 12, 13, 14, 15)); \
+ __m128##T1 Vec128 = \
+ (__m128##T1)(__builtin_shufflevector( \
+ (__v8s##T2)Vec256, \
+ (__v8s##T2)Vec256, \
+ 0, 1, 2, 3) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v8s##T2)Vec256, \
+ (__v8s##T2)Vec256, \
+ 4, 5, 6, 7)); \
+ Vec128 = (__m128##T1)(__builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 0, 1, -1, -1) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 2, 3, -1, -1)); \
+ Vec128 = (__m128##T1)(__builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 0, -1, -1, -1) \
+ Operator \
+ __builtin_shufflevector( \
+ (__v4s##T2)Vec128, \
+ (__v4s##T2)Vec128, \
+ 1, -1, -1, -1)); \
+ return Vec128[0]; \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_add_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, +, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_mul_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, *, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_and_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, &, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_or_epi32(__m512i __W) {
+ _mm512_reduce_operator_32bit(__W, |, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_add_ps(__m512 __W) {
+ _mm512_reduce_operator_32bit(__W, +, f, );
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_mul_ps(__m512 __W) {
+ _mm512_reduce_operator_32bit(__W, *, f, );
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - All vector elements set to the identity element.
+//                 Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0}
+// Operator - Can be one of the following: +,*,&,|
+// Mask - Intrinsic mask
+// T2 - Can be 'i' for int or 'f' for float.
+// T1 - Can be 'i' for int or ' ' (blank) for packed single.
+// T3 - Can be 'd' for d-word or 'ps' for packed single.
+
+#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator, \
+ Mask, T2, T1, T3) \
+ __extension__({ \
+ Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
+ (__mmask16)Mask, \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512Neutral); \
+ _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1); \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
+ i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps);
+}
+
+// We use the bisection method: at each step the vector from the previous step
+// is split in half, and the operation is performed on the two halves.
+// This takes log2(n) steps, where n is the number of elements in the vector.
+// This macro uses only intrinsics from the AVX512F feature.
+
+// Vec512 - Vector with size 512.
+// IntrinName - Can be one of the following: {max|min}_{epi64|epu64|pd}, for
+//              example: _mm512_max_epi64
+// T1 - Can be 'i' for int or 'd' for double. [__m512{i|d}]
+// T2 - Can be 'i' for int or 'f' for float. [__v8d{i|f}]
+
+#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, 1, 2, 3, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 4, 5, 6, 7, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, 1, -1, -1, -1, -1, -1, -1),\
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 2, 3, -1, -1, -1, -1, -1, \
+ -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, -1, -1, -1, -1, -1, -1, -1),\
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 1, -1, -1, -1, -1, -1, -1, -1))\
+ ; \
+ return Vec512[0]; \
+ })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epi64(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu64(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_reduce_max_pd(__m512d __V) {
+ _mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64
+(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, min_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu64(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_reduce_min_pd(__m512d __V) {
+ _mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
+}
+
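Editorial aside, not part of the commit: the new reductions are ordinary inline functions, so once this header is picked up they are called like any other AVX-512 intrinsic. A small usage sketch; the function name and the -mavx512f flag are illustrative.

    #include <immintrin.h>

    /* Horizontal maximum of 8 doubles; build e.g. with: clang -mavx512f */
    double max_of_8(const double *p) {
      __m512d v = _mm512_loadu_pd(p);  /* load 8 unaligned doubles */
      return _mm512_reduce_max_pd(v);  /* log2(8) = 3 max_pd steps */
    }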
+// Vec512 - Vector with size 512.
+// Vec512Neutral - A 512-bit vector with all elements set to the identity
+//                 element.
+// Identity element: {max_epi,0x8000000000000000}
+//                   {max_epu,0x0000000000000000}
+//                   {max_pd, 0xFFF0000000000000}
+//                   {min_epi,0x7FFFFFFFFFFFFFFF}
+//                   {min_epu,0xFFFFFFFFFFFFFFFF}
+//                   {min_pd, 0x7FF0000000000000}
+//
+// IntrinName - Can be one of the following: {max|min}_{epi64|epu64|pd}, for
+//              example: _mm512_max_epi64
+// T1 - Can be 'i' for int or 'd' for double. [__m512{i|d}]
+// T2 - Can be 'i' for int or 'f' for float. [__v8d{i|f}]
+// T3 - Can be 'q' for q-word or 'pd' for packed double.
+//      [__builtin_ia32_select{q|pd}_512]
+// Mask - Intrinsic mask
+
+#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \
+ T2, T3, Mask) \
+ __extension__({ \
+ Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
+ (__mmask8)Mask, \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512Neutral); \
+ _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \
+ })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000),
+ max_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000),
+ max_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, -_mm512_set1_pd(__builtin_inf()),
+ max_pd, d, f, pd, __M);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
+ min_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
+ min_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()),
+ min_pd, d, f, pd, __M);
+}
+
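Editorial aside, not part of the commit: the identity constants in the table above are the bit patterns that can never win the comparison, for example 0xFFF0000000000000 is -infinity for max_pd. A scalar sketch of the masked form, with an illustrative helper name:

    #include <math.h>
    #include <stdint.h>

    /* Sketch of _mm512_mask_reduce_max_pd: masked-off lanes are treated as
     * -INFINITY, the identity of max, before the unmasked reduction runs. */
    static double mask_reduce_max_f64(uint8_t mask, const double v[8]) {
      double best = -INFINITY;
      for (int i = 0; i < 8; ++i)
        if (mask & (1u << i))
          best = v[i] > best ? v[i] : best;
      return best;
    }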
+// Vec512 - Vector with size 512.
+// IntrinName - Can be one of the following: {max|min}_{epi32|epu32|ps}, for
+//              example: _mm512_max_epi32
+// T1 - Can be 'i' for int or ' ' (blank) for packed single. [__m512{i| }]
+// T2 - Can be 'i' for int or 'f' for float. [__v16s{i|f}]
+
+#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, 2, 3, 4, 5, 6, 7, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, 2, 3, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 4, 5, 6, 7, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 2, 3, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, -1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 1, -1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ return Vec512[0]; \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i __V) {
+  _mm512_reduce_maxMin_32bit(__V, max_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu32(__m512i __V) {
+  _mm512_reduce_maxMin_32bit(__V, max_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 __V) {
+  _mm512_reduce_maxMin_32bit(__V, max_ps, , f);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i __V) {
+  _mm512_reduce_maxMin_32bit(__V, min_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu32(__m512i __V) {
+  _mm512_reduce_maxMin_32bit(__V, min_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 __V) {
+  _mm512_reduce_maxMin_32bit(__V, min_ps, , f);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - A 512-bit vector with all elements set to the identity
+//                 element.
+// Identity element: {max_epi,0x80000000}
+//                   {max_epu,0x00000000}
+//                   {max_ps, 0xFF800000}
+//                   {min_epi,0x7FFFFFFF}
+//                   {min_epu,0xFFFFFFFF}
+//                   {min_ps, 0x7F800000}
+//
+// IntrinName - Can be one of the following: {max|min}_{epi32|epu32|ps}, for
+//              example: _mm512_max_epi32
+// T1 - Can be 'i' for int or ' ' (blank) for packed single. [__m512{i| }]
+// T2 - Can be 'i' for int or 'f' for float. [__v16s{i|f}]
+// T3 - Can be 'd' for d-word or 'ps' for packed single.
+//      [__builtin_ia32_select{d|ps}_512]
+// Mask - Intrinsic mask
+
+#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \
+ T2, T3, Mask) \
+ __extension__({ \
+ Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
+ (__mmask16)Mask, \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512Neutral); \
+ _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32,
+ i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32,
+ i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, -_mm512_set1_ps(__builtin_inff()), max_ps, , f,
+ ps, __M);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32,
+ i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32,
+ i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()), min_ps, , f,
+ ps, __M);
}
#undef __DEFAULT_FN_ATTRS
diff --git a/lib/Headers/avx512vlbwintrin.h b/lib/Headers/avx512vlbwintrin.h
index 990e992a113f..3b58d043395a 100644
--- a/lib/Headers/avx512vlbwintrin.h
+++ b/lib/Headers/avx512vlbwintrin.h
@@ -615,172 +615,143 @@ _mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
- return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+_mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_add_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi)
- _mm256_setzero_si256 (),
- (__mmask32) __U);
+_mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_add_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_add_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+_mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_add_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+_mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_sub_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi)
- _mm256_setzero_si256 (),
- (__mmask32) __U);
+_mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_sub_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sub_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sub_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
+
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+_mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_add_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi)
- _mm_setzero_si128 (),
- (__mmask16) __U);
+_mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_add_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_add_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+_mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_add_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+_mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_sub_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi)
- _mm_setzero_si128 (),
- (__mmask16) __U);
+_mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_sub_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sub_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+_mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sub_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mullo_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+_mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mullo_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mullo_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+_mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mullo_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
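/*
 * Illustrative sketch, not part of the header: the __builtin_ia32_select*
 * calls above behave like a per-lane conditional move keyed off the mask
 * bits.  A scalar model of the 8 x 16-bit case (names here are mine):
 */
#include <stdint.h>

static void select_w128_model(uint8_t mask, const int16_t op[8],
                              const int16_t src[8], int16_t out[8]) {
  for (int i = 0; i < 8; ++i)
    out[i] = ((mask >> i) & 1) ? op[i] : src[i];  /* merge masking */
}

/*
 * The _maskz_ intrinsics pass a zero vector as src, so unselected lanes
 * become 0 (zero masking) rather than keeping the old destination value.
 */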
@@ -816,937 +787,802 @@ _mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_abs_epi8(__A),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A)
+_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_abs_epi8(__A),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A)
{
- return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_abs_epi8(__A),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
{
- return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_abs_epi8(__A),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_abs_epi16(__A),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A)
+_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_abs_epi16(__A),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A)
{
- return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_abs_epi16(__A),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A)
+_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A)
{
- return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_abs_epi16(__A),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v8hi) _mm_setzero_si128 (), __M);
+_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packs_epi32(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packs_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v8hi) __W, __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packs_epi32(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v16hi) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packs_epi32(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v16hi) __W, __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packs_epi32(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v16qi) _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packs_epi16(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v16qi) __W,
- __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packs_epi16(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packs_epi16(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v32qi) __W,
- __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packs_epi16(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v8hi) _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packus_epi32(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packus_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_packus_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v8hi) __W, __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packus_epi32(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v16hi) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packus_epi32(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v16hi) __W,
- __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packus_epi32(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v16qi) _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packus_epi16(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v16qi) __W,
- __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packus_epi16(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packus_epi16(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v32qi) __W,
- __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packus_epi16(__A, __B),
+ (__v32qi)__W);
}
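/*
 * Hedged usage sketch (mine, not from the header): the pack intrinsics
 * narrow the element type, so the mask is sized for the result lanes,
 * 8 x 16-bit here, even though each input holds 4 x 32-bit.  Assumes a
 * translation unit built with -mavx512bw -mavx512vl.
 */
#include <immintrin.h>

static __m128i demo_maskz_packs_epi32(void) {
  __m128i a = _mm_set_epi32(40000, 3, -2, 1);  /* 40000 saturates to 32767 */
  __m128i b = _mm_set_epi32(8, 7, 6, 5);
  /* lanes 0-3 come from a and are kept; lanes 4-7 come from b and are zeroed */
  return _mm_maskz_packs_epi32((__mmask8)0x0F, a, b);
}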
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epu8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epu8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_avg_epu8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_avg_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_avg_epu8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_avg_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_avg_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_avg_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_avg_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_avg_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
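/*
 * Hedged usage sketch (mine, not from the header): the _mask_ flavours
 * merge unselected lanes from __W, the _maskz_ flavours zero them.
 * Assumes -mavx512bw -mavx512vl.
 */
#include <immintrin.h>

static void demo_adds_epu8(__m128i a, __m128i b, __m128i old,
                           __m128i *merged, __m128i *zeroed) {
  __mmask16 keep_even = 0x5555;                        /* lanes 0, 2, 4, ... */
  *merged = _mm_mask_adds_epu8(old, keep_even, a, b);  /* odd lanes copy old */
  *zeroed = _mm_maskz_adds_epu8(keep_even, a, b);      /* odd lanes become 0 */
}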
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B)
+_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epu8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epu8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B)
+_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B)
+_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epu8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epu8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B)
+_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_shuffle_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_shuffle_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_shuffle_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_shuffle_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epi8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epi8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
- __m128i __B)
+_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) __W,
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epu8(__A, __B),
+ (__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
- (__v16qi) __B,
- (__v16qi) _mm_setzero_si128 (),
- (__mmask16) __U);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) __W,
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epu8(__A, __B),
+ (__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
- (__v32qi) __B,
- (__v32qi) _mm256_setzero_si256 (),
- (__mmask32) __U);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_subs_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
-{
- return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -1828,69 +1664,60 @@ _mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A,
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
- (__v16qi) __Y,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_maddubs_epi16(__X, __Y),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
- (__v16qi) __Y,
- (__v8hi) _mm_setzero_si128(),
- (__mmask8) __U);
+_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_maddubs_epi16(__X, __Y),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
- __m256i __Y) {
- return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
- (__v32qi) __Y,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
+ __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
- return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
- (__v32qi) __Y,
- (__v16hi) _mm256_setzero_si256(),
- (__mmask16) __U);
+_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v4si) __W,
- (__mmask8) __U);
+_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_madd_epi16(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v4si) _mm_setzero_si128(),
- (__mmask8) __U);
+_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_madd_epi16(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v8si) __W,
- (__mmask8) __U);
+_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_madd_epi16(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v8si) _mm256_setzero_si256(),
- (__mmask8) __U);
+_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_madd_epi16(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
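/*
 * Hedged sketch (mine, not from the header): madd_epi16 widens to 32-bit
 * sums, so the masked forms select on dword lanes; a 128-bit vector has
 * only 4 of them and just the low 4 bits of the __mmask8 are used.
 */
#include <immintrin.h>

static __m128i demo_maskz_madd_epi16(__m128i a, __m128i b) {
  /* keep dword lanes 0 and 2, zero lanes 1 and 3 */
  return _mm_maskz_madd_epi16((__mmask8)0x5, a, b);
}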
@@ -2056,104 +1883,89 @@ _mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
{
__builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M);
}
+
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
- (__v8hi) __Y,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
- (__v8hi) __Y,
- (__v8hi) _mm_setzero_si128(),
- (__mmask8) __U);
+_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
- return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
- (__v16hi) __Y,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) {
- return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
- (__v16hi) __Y,
- (__v16hi) _mm256_setzero_si256(),
- (__mmask16) __U);
+_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epu16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128(),
- (__mmask8) __U);
+_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epu16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256(),
- (__mmask16) __U);
+_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) _mm_setzero_si128(),
- (__mmask8) __U);
+_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) _mm256_setzero_si256(),
- (__mmask16) __U);
+_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -2269,72 +2081,68 @@ _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask32 __U, __m128i __A)
+_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepi8_epi16(__A),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi8_epi16 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepi8_epi16(__A),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask32 __U, __m128i __A)
+_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepi8_epi16(__A),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi8_epi16 (__mmask16 __U, __m128i __A)
+_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepi8_epi16(__A),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask32 __U, __m128i __A)
+_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepu8_epi16(__A),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu8_epi16 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepu8_epi16(__A),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask32 __U, __m128i __A)
+_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepu8_epi16(__A),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepu8_epi16(__A),
+ (__v16hi)_mm256_setzero_si256());
}
@@ -2461,366 +2269,328 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
(__v16hi)_mm256_setzero_si256()); })
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_sllv_epi16 (__m256i __A, __m256i __B)
+_mm256_sllv_epi16(__m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) -1);
+ return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sllv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sllv_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sllv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sllv_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sllv_epi16 (__m128i __A, __m128i __B)
+_mm_sllv_epi16(__m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_hi (),
- (__mmask8) -1);
+ return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sllv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sllv_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sllv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sllv_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sll_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sll_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sll_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sll_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sll_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sll_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sll_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
-#define _mm_mask_slli_epi16(W, U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)(__m128i)(A), (int)(B), \
- (__v8hi)(__m128i)(W), \
- (__mmask8)(U)); })
-
-#define _mm_maskz_slli_epi16(U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)(__m128i)(A), (int)(B), \
- (__v8hi)_mm_setzero_si128(), \
- (__mmask8)(U)); })
-
-#define _mm256_mask_slli_epi16(W, U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)(__m256i)(A), (int)(B), \
- (__v16hi)(__m256i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_slli_epi16(__A, __B),
+ (__v8hi)__W);
+}
-#define _mm256_maskz_slli_epi16(U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)(__m256i)(A), (int)(B), \
- (__v16hi)_mm256_setzero_si256(), \
- (__mmask16)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_slli_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_slli_epi16(__A, __B),
+ (__v16hi)__W);
+}
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_slli_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_srlv_epi16 (__m256i __A, __m256i __B)
+_mm256_srlv_epi16(__m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) -1);
+ return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srlv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srlv_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srlv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srlv_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srlv_epi16 (__m128i __A, __m128i __B)
+_mm_srlv_epi16(__m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_hi (),
- (__mmask8) -1);
+ return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srlv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srlv_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srlv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srlv_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_srav_epi16 (__m256i __A, __m256i __B)
+_mm256_srav_epi16(__m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) -1);
+ return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srav_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srav_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srav_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
- (__v16hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srav_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srav_epi16 (__m128i __A, __m128i __B)
+_mm_srav_epi16(__m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_hi (),
- (__mmask8) -1);
+ return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srav_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srav_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srav_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srav_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sra_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sra_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sra_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sra_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sra_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sra_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sra_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sra_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
-#define _mm_mask_srai_epi16(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
- (__v8hi)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srai_epi16(__A, __B),
+ (__v8hi)__W);
+}
-#define _mm_maskz_srai_epi16(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
- (__v8hi)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srai_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
-#define _mm256_mask_srai_epi16(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
- (__v16hi)(__m256i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srai_epi16(__A, __B),
+ (__v16hi)__W);
+}
-#define _mm256_maskz_srai_epi16(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
- (__v16hi)_mm256_setzero_si256(), \
- (__mmask16)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srai_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srl_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srl_epi16(__A, __B),
+ (__v8hi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
- (__v8hi) __B,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srl_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srl_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi) __W,
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srl_epi16(__A, __B),
+ (__v16hi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srl_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
- (__v8hi) __B,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srl_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
}
-#define _mm_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
- (__v8hi)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srli_epi16(__A, __B),
+ (__v8hi)__W);
+}
-#define _mm_maskz_srli_epi16(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(imm), \
- (__v8hi)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srli_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
-#define _mm256_mask_srli_epi16(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
- (__v16hi)(__m256i)(W), \
- (__mmask16)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srli_epi16(__A, __B),
+ (__v16hi)__W);
+}
-#define _mm256_maskz_srli_epi16(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(imm), \
- (__v16hi)_mm256_setzero_si256(), \
- (__mmask16)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srli_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
@@ -3342,28 +3112,24 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
}
#define _mm_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
- (__m128i)__builtin_ia32_palignr128_mask((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(N), \
- (__v16qi)(__m128i)(W), \
- (__mmask16)(U)); })
+ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+ (__v16qi)(__m128i)(W)); })
#define _mm_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
- (__m128i)__builtin_ia32_palignr128_mask((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(N), \
- (__v16qi)_mm_setzero_si128(), \
- (__mmask16)(U)); })
+ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+ (__v16qi)_mm_setzero_si128()); })
#define _mm256_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
- (__m256i)__builtin_ia32_palignr256_mask((__v32qi)(__m256i)(A), \
- (__v32qi)(__m256i)(B), (int)(N), \
- (__v32qi)(__m256i)(W), \
- (__mmask32)(U)); })
+ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+ (__v32qi)(__m256i)(W)); })
#define _mm256_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
- (__m256i)__builtin_ia32_palignr256_mask((__v32qi)(__m256i)(A), \
- (__v32qi)(__m256i)(B), (int)(N), \
- (__v32qi)_mm256_setzero_si256(), \
- (__mmask32)(U)); })
+ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+ (__v32qi)_mm256_setzero_si256()); })
#define _mm_dbsad_epu8(A, B, imm) __extension__ ({ \
(__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
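The avx512vlbwintrin.h changes above all follow one pattern: each masked intrinsic now computes the unmasked operation and then blends the result per lane through a generic select builtin (selectb_128/256, selectw_128/256) instead of calling a dedicated *_mask builtin. A rough scalar model of that per-lane blend, written as a plain C sketch (the helper name and loop are illustrative only, not part of the header):

#include <stdint.h>

/* Models the semantics of the 128-bit word select used above: for each
 * 16-bit lane i, take the freshly computed value when mask bit i is set,
 * otherwise keep the fallback (__W for mask_* forms, zero for maskz_* forms). */
static void selectw_128_model(uint8_t mask, const int16_t op[8],
                              const int16_t fallback[8], int16_t out[8]) {
  for (int i = 0; i < 8; ++i)
    out[i] = ((mask >> i) & 1) ? op[i] : fallback[i];
}

The same shape scales to the 256-bit forms, which use a 16-bit mask over sixteen word lanes.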
diff --git a/lib/Headers/avx512vldqintrin.h b/lib/Headers/avx512vldqintrin.h
index 8187bcd6b28e..cd9da4370564 100644
--- a/lib/Headers/avx512vldqintrin.h
+++ b/lib/Headers/avx512vldqintrin.h
@@ -37,20 +37,17 @@ _mm256_mullo_epi64 (__m256i __A, __m256i __B) {
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_mullo_epi64(__A, __B),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_mullo_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -59,293 +56,241 @@ _mm_mullo_epi64 (__m128i __A, __m128i __B) {
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_mullo_epi64(__A, __B),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_mullo_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_andnot_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_andnot_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_andnot_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_andnot_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_andnot_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_andnot_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_andnot_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_andnot_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_and_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_and_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_and_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_and_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_and_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_and_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_and_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_and_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_xor_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_xor_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_xor_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_xor_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_xor_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_xor_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_xor_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_xor_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_or_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_or_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_or_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_or_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_or_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_or_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_or_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_or_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -1151,82 +1096,72 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
}
#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1); })
+ (__m128d)__builtin_shufflevector((__v4df)(__m256d)(A), \
+ (__v4df)_mm256_undefined_pd(), \
+ ((imm) & 1) ? 2 : 0, \
+ ((imm) & 1) ? 3 : 1); })
#define _mm256_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U)); })
+ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
+ (__v2df)(W)); })
#define _mm256_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
- (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
+ (__v2df)_mm_setzero_pd()); })
#define _mm256_extracti64x2_epi64(A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
- (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)-1); })
+ (__m128i)__builtin_shufflevector((__v4di)(__m256i)(A), \
+ (__v4di)_mm256_undefined_si256(), \
+ ((imm) & 1) ? 2 : 0, \
+ ((imm) & 1) ? 3 : 1); })
#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
- (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
+ (__v2di)(W)); })
#define _mm256_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
- (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
+ (__v2di)_mm_setzero_di()); })
#define _mm256_insertf64x2(A, B, imm) __extension__ ({ \
- (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)-1); })
+ (__m256d)__builtin_shufflevector((__v4df)(A), \
+ (__v4df)_mm256_castpd128_pd256((__m128d)(B)), \
+ ((imm) & 0x1) ? 0 : 4, \
+ ((imm) & 0x1) ? 1 : 5, \
+ ((imm) & 0x1) ? 4 : 2, \
+ ((imm) & 0x1) ? 5 : 3); })
#define _mm256_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
- (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v4df)(__m256d)(W), \
- (__mmask8)(U)); })
+ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+ (__v4df)(W)); })
#define _mm256_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
- (__m256d)__builtin_ia32_insertf64x2_256_mask((__v4df)(__m256d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(imm), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)(U)); })
+ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+ (__v4df)_mm256_setzero_pd()); })
#define _mm256_inserti64x2(A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)-1); })
+ (__m256i)__builtin_shufflevector((__v4di)(A), \
+ (__v4di)_mm256_castsi128_si256((__m128i)(B)), \
+ ((imm) & 0x1) ? 0 : 4, \
+ ((imm) & 0x1) ? 1 : 5, \
+ ((imm) & 0x1) ? 4 : 2, \
+ ((imm) & 0x1) ? 5 : 3); })
#define _mm256_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+ (__v4di)(W)); })
#define _mm256_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti64x2_256_mask((__v4di)(__m256i)(A), \
- (__v2di)(__m128i)(B), \
- (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+ (__v4di)_mm256_setzero_si256()); })
#define _mm_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
(__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
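In avx512vldqintrin.h the immediate-operand extract/insert macros are expressed with __builtin_shufflevector rather than masked builtins: bit 0 of the immediate decides which pair of 64-bit elements is gathered, and the mask/maskz wrappers then reuse the plain form plus a select. A small sketch of the index selection using clang's generic vector extension (the typedefs and helper name are illustrative, not from the header):

typedef double double4 __attribute__((vector_size(32)));
typedef double double2 __attribute__((vector_size(16)));

/* Picks the upper {2,3} or lower {0,1} 128-bit half of a 4-double vector,
 * mirroring what ((imm) & 1) selects in _mm256_extractf64x2_pd above. */
static double2 extract_f64x2_model(double4 v, int upper) {
  return upper ? __builtin_shufflevector(v, v, 2, 3)
               : __builtin_shufflevector(v, v, 0, 1);
}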
diff --git a/lib/Headers/avx512vlintrin.h b/lib/Headers/avx512vlintrin.h
index 295ce291f7ce..f3744da6ab8a 100644
--- a/lib/Headers/avx512vlintrin.h
+++ b/lib/Headers/avx512vlintrin.h
@@ -616,277 +616,227 @@ _mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_add_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_add_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_add_epi64(__A, __B),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_add_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sub_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sub_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- __m256i __B)
+_mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sub_epi64(__A, __B),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+_mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sub_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_add_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_add_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_add_epi64(__A, __B),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_add_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sub_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sub_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sub_epi64(__A, __B),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sub_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
- __m256i __Y)
+_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v4di) __W, __M);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epi32(__X, __Y),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epi32(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
- __m128i __Y)
+_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v2di) __W, __M);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epi32(__X, __Y),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
+_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v2di)
- _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epi32(__X, __Y),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
- __m256i __Y)
+_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v4di) __W, __M);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epu32(__X, __Y),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
+_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epu32(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
- __m128i __Y)
+_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v2di) __W, __M);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epu32(__X, __Y),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
+_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v2di)
- _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epu32(__X, __Y),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+_mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_mullo_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
- __m256i __B)
+_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W, __M);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_mullo_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+_mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_mullo_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_mullo_epi32 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
+_mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W, __M);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_mullo_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -1895,71 +1845,59 @@ _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_add_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_add_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_add_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_add_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_add_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_add_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_add_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_add_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -2196,32 +2134,30 @@ _mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
- return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
- (__v2df) __W,
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+ (__v2df)_mm_cvtepi32_pd(__A),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
- return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+ (__v2df)_mm_cvtepi32_pd(__A),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
- return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
- (__v4df) __W,
- (__mmask8) __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+ (__v4df)_mm256_cvtepi32_pd(__A),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
- return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+ (__v4df)_mm256_cvtepi32_pd(__A),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -2620,48 +2556,41 @@ _mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) {
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepu32_pd (__m128i __A) {
- return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) -1);
+ return (__m128d) __builtin_convertvector(
+ __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
- return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
- (__v2df) __W,
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+ (__v2df)_mm_cvtepu32_pd(__A),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
- return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+ (__v2df)_mm_cvtepu32_pd(__A),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_cvtepu32_pd (__m128i __A) {
- return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) -1);
+ return (__m256d)__builtin_convertvector((__v4su)__A, __v4df);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
- return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
- (__v4df) __W,
- (__mmask8) __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+ (__v4df)_mm256_cvtepu32_pd(__A),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
- return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+ (__v4df)_mm256_cvtepu32_pd(__A),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -2711,72 +2640,59 @@ _mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_div_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_div_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_div_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_div_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_div_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_div_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_div_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_div_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -3127,240 +3043,199 @@ _mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_max_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_max_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_max_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_max_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_max_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_max_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_max_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_max_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_min_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_min_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_min_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_min_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_min_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_min_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_min_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_min_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_mul_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_mul_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_mul_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_mul_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_mul_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_mul_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_mul_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_mul_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
- (__v4si) __W,
- (__mmask8) __U);
+_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_abs_epi32(__A),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_abs_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
- (__v8si) __W,
- (__mmask8) __U);
+_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_abs_epi32(__A),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_abs_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -3410,37 +3285,31 @@ _mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- __M);
+_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W, __M);
+_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- __M);
+_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W, __M);
+_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -3496,37 +3365,31 @@ _mm256_max_epi64 (__m256i __A, __m256i __B) {
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- __M);
+_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epu32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W, __M);
+_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epu32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- __M);
+_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epu32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W, __M);
+_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epu32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -3582,37 +3445,31 @@ _mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- __M);
+_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W, __M);
+_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- __M);
+_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W, __M);
+_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -3668,37 +3525,31 @@ _mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- __M);
+_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epu32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B) {
- return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W, __M);
+_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epu32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- __M);
+_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epu32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
- __m256i __B) {
- return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
- (__v8si) __B,
- (__v8si) __W, __M);
+_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epu32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -4095,132 +3946,115 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
(__v8si)(__m256i)(v1), (int)(scale)); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) {
- return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sqrt_pd(__A),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A) {
- return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sqrt_pd(__A),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A) {
- return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sqrt_pd(__A),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A) {
- return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sqrt_pd(__A),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A) {
- return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sqrt_ps(__A),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A) {
- return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sqrt_ps(__A),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A) {
- return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sqrt_ps(__A),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A) {
- return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sqrt_ps(__A),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
+_mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sub_pd(__A, __B),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+_mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sub_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
+_mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sub_pd(__A, __B),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
+_mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sub_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_sub_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
+_mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sub_ps(__A, __B),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_sub_ps (__mmask16 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+_mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sub_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_sub_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
+_mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sub_ps(__A, __B),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_sub_ps (__mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+_mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sub_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -4551,344 +4385,324 @@ _mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A,
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi8_epi32(__A),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi8_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi8_epi32(__A),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi8_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi8_epi64(__A),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi8_epi64(__A),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi8_epi64(__A),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+_mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi8_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
+_mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
{
- return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi32_epi64(__X),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
+_mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
{
- return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi32_epi64(__X),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
+_mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
{
- return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi32_epi64(__X),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
+_mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
{
- return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi32_epi64(__X),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi16_epi32(__A),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi16_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi16_epi32(__A),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi16_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepi16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi16_epi64(__A),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi16_epi64(__A),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepi16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi16_epi64(__A),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+_mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi16_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu8_epi32(__A),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu8_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu8_epi32(__A),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
+_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu8_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu8_epi64(__A),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu8_epi64(__A),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu8_epi64(__A),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu8_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
+_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
{
- return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu32_epi64(__X),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
+_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
{
- return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu32_epi64(__X),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
+_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
{
- return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu32_epi64(__X),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
+_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
{
- return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu32_epi64(__X),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu16_epi32(__A),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu16_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu16_epi32(__A),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
+_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu16_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_cvtepu16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu16_epi64(__A),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu16_epi64(__A),
+ (__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_cvtepu16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu16_epi64(__A),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
{
- return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu16_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
}
@@ -5125,125 +4939,132 @@ _mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
(__mmask8)(U)); })
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sll_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sll_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sll_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sll_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
-#define _mm_mask_slli_epi32(W, U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(A), (int)(B), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_slli_epi32(__A, __B),
+ (__v4si)__W);
+}
-#define _mm_maskz_slli_epi32(U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(A), (int)(B), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_slli_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
-#define _mm256_mask_slli_epi32(W, U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(A), (int)(B), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_slli_epi32(__A, __B),
+ (__v8si)__W);
+}
-#define _mm256_maskz_slli_epi32(U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(A), (int)(B), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_slli_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sll_epi64(__A, __B),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sll_epi64(__A, __B),
+ (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sll_epi64(__A, __B),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sll_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
}
-#define _mm_mask_slli_epi64(W, U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(A), (int)(B), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
-
-#define _mm_maskz_slli_epi64(U, A, B) __extension__ ({ \
- (__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(A), (int)(B), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_slli_epi64(__A, __B),
+ (__v2di)__W);
+}
-#define _mm256_mask_slli_epi64(W, U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(A), (int)(B), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_slli_epi64(__A, __B),
+ (__v2di)_mm_setzero_di());
+}
-#define _mm256_maskz_slli_epi64(U, A, B) __extension__ ({ \
- (__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(A), (int)(B), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_slli_epi64(__A, __B),
+ (__v4di)__W);
+}
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_slli_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_rorv_epi32 (__m128i __A, __m128i __B)
@@ -5366,387 +5187,335 @@ _mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sllv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sllv_epi64(__X, __Y),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sllv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sllv_epi64(__X, __Y),
+ (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sllv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sllv_epi64(__X, __Y),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sllv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sllv_epi64(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sllv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sllv_epi32(__X, __Y),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sllv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sllv_epi32(__X, __Y),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sllv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sllv_epi32(__X, __Y),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sllv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sllv_epi32(__X, __Y),
+ (__v8si)_mm256_setzero_si256());
}
-
-
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srlv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srlv_epi64(__X, __Y),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srlv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srlv_epi64(__X, __Y),
+ (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srlv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srlv_epi64(__X, __Y),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srlv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srlv_epi64(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srlv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srlv_epi32(__X, __Y),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srlv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srlv_epi32(__X, __Y),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srlv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srlv_epi32(__X, __Y),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srlv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srlv_epi32(__X, __Y),
+ (__v8si)_mm256_setzero_si256());
}
-
-
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srl_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srl_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srl_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srl_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srl_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srl_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srl_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srl_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
-#define _mm_mask_srli_epi32(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srli_epi32(__A, __B),
+ (__v4si)__W);
+}
-#define _mm_maskz_srli_epi32(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srli_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
-#define _mm256_mask_srli_epi32(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(imm), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srli_epi32(__A, __B),
+ (__v8si)__W);
+}
-#define _mm256_maskz_srli_epi32(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srli_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srl_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srl_epi64(__A, __B),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srl_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srl_epi64(__A, __B),
+ (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srl_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srl_epi64(__A, __B),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srl_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srl_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
}
-#define _mm_mask_srli_epi64(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srli_epi64(__A, __B),
+ (__v2di)__W);
+}
-#define _mm_maskz_srli_epi64(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
- (__v2di)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srli_epi64(__A, __B),
+ (__v2di)_mm_setzero_di());
+}
-#define _mm256_mask_srli_epi64(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srli_epi64(__A, __B),
+ (__v4di)__W);
+}
-#define _mm256_maskz_srli_epi64(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srli_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srav_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srav_epi32(__X, __Y),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srav_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
- (__v4si) __Y,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srav_epi32(__X, __Y),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srav_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srav_epi32(__X, __Y),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_srav_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
- (__v8si) __Y,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srav_epi32(__X, __Y),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srav_epi64 (__m128i __X, __m128i __Y)
+_mm_srav_epi64(__m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) -1);
+ return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_srav_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
- __m128i __Y)
+_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srav_epi64(__X, __Y),
+ (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_srav_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
- (__v2di) __Y,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srav_epi64(__X, __Y),
+ (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_srav_epi64 (__m256i __X, __m256i __Y)
+_mm256_srav_epi64(__m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) -1);
+  return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di)__Y);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_srav_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
- __m256i __Y)
+_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srav_epi64(__X, __Y),
+ (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
{
- return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
- (__v4di) __Y,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srav_epi64(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -5975,6 +5744,7 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
(__v8si)_mm256_setzero_si256(), \
(__mmask8)(M)); })
+#ifdef __x86_64__
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
{
@@ -6006,6 +5776,7 @@ _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
_mm256_setzero_si256 (),
__M);
}
+#endif
#define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \
(__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
@@ -6653,85 +6424,67 @@ _mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
(__v8sf)_mm256_setzero_ps()); })
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A,
- __m128i __C)
+_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C)
{
- return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
- (__v2di) __C,
- (__v2df) __W,
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_permutevar_pd(__A, __C),
+ (__v2df)__W);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C)
+_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C)
{
- return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
- (__v2di) __C,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_permutevar_pd(__A, __C),
+ (__v2df)_mm_setzero_pd());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256i __C)
+_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C)
{
- return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
- (__v4di) __C,
- (__v4df) __W,
- (__mmask8)
- __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_permutevar_pd(__A, __C),
+ (__v4df)__W);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C)
+_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C)
{
- return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
- (__v4di) __C,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8)
- __U);
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_permutevar_pd(__A, __C),
+ (__v4df)_mm256_setzero_pd());
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A,
- __m128i __C)
+_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C)
{
- return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
- (__v4si) __C,
- (__v4sf) __W,
- (__mmask8) __U);
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_permutevar_ps(__A, __C),
+ (__v4sf)__W);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C)
+_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C)
{
- return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
- (__v4si) __C,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_permutevar_ps(__A, __C),
+ (__v4sf)_mm_setzero_ps());
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A,
- __m256i __C)
+_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C)
{
- return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
- (__v8si) __C,
- (__v8sf) __W,
- (__mmask8) __U);
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_permutevar_ps(__A, __C),
+ (__v8sf)__W);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C)
+_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C)
{
- return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
- (__v8si) __C,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_permutevar_ps(__A, __C),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __mmask8 __DEFAULT_FN_ATTRS
@@ -6985,154 +6738,156 @@ _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si) __W,
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sra_epi32(__A, __B),
+ (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
- (__v4si) __B,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sra_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si) __W,
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sra_epi32(__A, __B),
+ (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
- (__v4si) __B,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sra_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
}
-#define _mm_mask_srai_epi32(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srai_epi32(__A, __B),
+ (__v4si)__W);
+}
-#define _mm_maskz_srai_epi32(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srai_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
-#define _mm256_mask_srai_epi32(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(imm), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srai_epi32(__A, __B),
+ (__v8si)__W);
+}
-#define _mm256_maskz_srai_epi32(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srai_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sra_epi64 (__m128i __A, __m128i __B)
+_mm_sra_epi64(__m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) -1);
+ return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
+_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di) __W,
- (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sra_epi64(__A, __B),
+                                             (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sra_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_di());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_sra_epi64 (__m256i __A, __m128i __B)
+_mm256_sra_epi64(__m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) -1);
+  return (__m256i)__builtin_ia32_psraq256((__v4di)__A, (__v2di)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
- __m128i __B)
+_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di) __W,
- (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sra_epi64(__A, __B),
+                                             (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
{
- return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
- (__v2di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sra_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
}
-#define _mm_srai_epi64(A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)-1); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi64(__m128i __A, int __imm)
+{
+ return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
+}
-#define _mm_mask_srai_epi64(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srai_epi64(__A, __imm),
+                                             (__v2di)__W);
+}
-#define _mm_maskz_srai_epi64(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(imm), \
- (__v2di)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srai_epi64(__A, __imm),
+                                             (__v2di)_mm_setzero_di());
+}
-#define _mm256_srai_epi64(A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)-1); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi64(__m256i __A, int __imm)
+{
+ return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
+}
-#define _mm256_mask_srai_epi64(W, U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srai_epi64(__A, __imm),
+                                             (__v4di)__W);
+}
-#define _mm256_maskz_srai_epi64(U, A, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srai_epi64(__A, __imm),
+                                             (__v4di)_mm256_setzero_si256());
+}
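With the immediate-count shifts now exposed as inline functions rather than macros, the masked forms compose like ordinary calls. A small usage sketch, assuming the translation unit is built with AVX-512F and AVX-512VL enabled (e.g. -mavx512f -mavx512vl or -march=skylake-avx512); the wrapper name is illustrative:

#include <immintrin.h>

/* Arithmetic-shift four signed 64-bit lanes right by 3, keeping only the
 * lanes whose mask bit is set and zeroing the rest. */
__m256i shift_selected(__m256i v, __mmask8 keep)
{
  return _mm256_maskz_srai_epi64(keep, v, 3);
}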
#define _mm_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
(__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
@@ -8473,79 +8228,84 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
}
#define _mm256_extractf32x4_ps(A, imm) __extension__ ({ \
- (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
- (int)(imm), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1); })
+ (__m128)__builtin_shufflevector((__v8sf)(__m256)(A), \
+ (__v8sf)_mm256_undefined_ps(), \
+ ((imm) & 1) ? 4 : 0, \
+ ((imm) & 1) ? 5 : 1, \
+ ((imm) & 1) ? 6 : 2, \
+ ((imm) & 1) ? 7 : 3); })
#define _mm256_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({ \
- (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
- (int)(imm), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U)); })
+ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \
+ (__v4sf)(W)); })
#define _mm256_maskz_extractf32x4_ps(U, A, imm) __extension__ ({ \
- (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
- (int)(imm), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U)); })
+ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \
+ (__v4sf)_mm_setzero_ps()); })
#define _mm256_extracti32x4_epi32(A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
- (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)-1); })
+  (__m128i)__builtin_shufflevector((__v8si)(__m256i)(A), \
+ (__v8si)_mm256_undefined_si256(), \
+ ((imm) & 1) ? 4 : 0, \
+ ((imm) & 1) ? 5 : 1, \
+ ((imm) & 1) ? 6 : 2, \
+ ((imm) & 1) ? 7 : 3); })
#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
- (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \
+ (__v4si)(W)); })
#define _mm256_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
- (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \
+ (__v4si)_mm_setzero_si128()); })
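The rewritten extract macros pick the low or high 128-bit lane purely from bit 0 of imm. A brief usage sketch under the same AVX-512VL assumptions as above; the wrapper name is illustrative:

#include <immintrin.h>

/* Pull the upper 128-bit lane (imm bit 0 == 1) out of a 256-bit vector of
 * packed 32-bit integers. */
__m128i upper_lane(__m256i v)
{
  return _mm256_extracti32x4_epi32(v, 1);
}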
#define _mm256_insertf32x4(A, B, imm) __extension__ ({ \
- (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1); })
+ (__m256)__builtin_shufflevector((__v8sf)(A), \
+ (__v8sf)_mm256_castps128_ps256((__m128)(B)), \
+ ((imm) & 0x1) ? 0 : 8, \
+ ((imm) & 0x1) ? 1 : 9, \
+ ((imm) & 0x1) ? 2 : 10, \
+ ((imm) & 0x1) ? 3 : 11, \
+ ((imm) & 0x1) ? 8 : 4, \
+ ((imm) & 0x1) ? 9 : 5, \
+ ((imm) & 0x1) ? 10 : 6, \
+ ((imm) & 0x1) ? 11 : 7); })
#define _mm256_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
- (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v8sf)(__m256)(W), \
- (__mmask8)(U)); })
+ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+ (__v8sf)(W)); })
#define _mm256_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
- (__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(A), \
- (__v4sf)(__m128)(B), (int)(imm), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U)); })
+ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+ (__v8sf)_mm256_setzero_ps()); })
#define _mm256_inserti32x4(A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \
- (__v4si)(__m128i)(B), \
- (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)-1); })
+ (__m256i)__builtin_shufflevector((__v8si)(A), \
+ (__v8si)_mm256_castsi128_si256((__m128i)(B)), \
+ ((imm) & 0x1) ? 0 : 8, \
+ ((imm) & 0x1) ? 1 : 9, \
+ ((imm) & 0x1) ? 2 : 10, \
+ ((imm) & 0x1) ? 3 : 11, \
+ ((imm) & 0x1) ? 8 : 4, \
+ ((imm) & 0x1) ? 9 : 5, \
+ ((imm) & 0x1) ? 10 : 6, \
+ ((imm) & 0x1) ? 11 : 7); })
#define _mm256_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \
- (__v4si)(__m128i)(B), \
- (int)(imm), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+ (__v8si)(W)); })
#define _mm256_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(A), \
- (__v4si)(__m128i)(B), \
- (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+ (__v8si)_mm256_setzero_si256()); })
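In the insert macros the shuffle indices follow one convention: positions 0-7 name the lanes of A and positions 8-11 name the low lanes of B after it is widened to 256 bits, so imm bit 0 only decides which half of A is replaced. A short usage sketch (same AVX-512VL assumptions, illustrative name):

#include <immintrin.h>

/* Replace the low 128-bit lane of acc with patch and leave the high lane
 * untouched (imm bit 0 == 0 targets lanes 0..3). */
__m256i patch_low_lane(__m256i acc, __m128i patch)
{
  return _mm256_inserti32x4(acc, patch, 0);
}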
#define _mm_getmant_pd(A, B, C) __extension__({\
(__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
@@ -8860,76 +8620,78 @@ _mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
}
#define _mm_alignr_epi32(A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v4si)_mm_undefined_si128(), \
- (__mmask8)-1); })
+ (__m128i)__builtin_shufflevector((__v4si)(__m128i)(B), \
+ (__v4si)(__m128i)(A), \
+ ((int)(imm) & 0x3) + 0, \
+ ((int)(imm) & 0x3) + 1, \
+ ((int)(imm) & 0x3) + 2, \
+ ((int)(imm) & 0x3) + 3); })
#define _mm_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+ (__v4si)(__m128i)(W)); })
#define _mm_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+ (__v4si)_mm_setzero_si128()); })
#define _mm256_alignr_epi32(A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v8si)_mm256_undefined_si256(), \
- (__mmask8)-1); })
+ (__m256i)__builtin_shufflevector((__v8si)(__m256i)(B), \
+ (__v8si)(__m256i)(A), \
+ ((int)(imm) & 0x7) + 0, \
+ ((int)(imm) & 0x7) + 1, \
+ ((int)(imm) & 0x7) + 2, \
+ ((int)(imm) & 0x7) + 3, \
+ ((int)(imm) & 0x7) + 4, \
+ ((int)(imm) & 0x7) + 5, \
+ ((int)(imm) & 0x7) + 6, \
+ ((int)(imm) & 0x7) + 7); })
#define _mm256_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+ (__v8si)(__m256i)(W)); })
#define _mm256_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+ (__v8si)_mm256_setzero_si256()); })
#define _mm_alignr_epi64(A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)-1); })
+ (__m128i)__builtin_shufflevector((__v2di)(__m128i)(B), \
+ (__v2di)(__m128i)(A), \
+ ((int)(imm) & 0x1) + 0, \
+ ((int)(imm) & 0x1) + 1); })
#define _mm_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+ (__v2di)(__m128i)(W)); })
#define _mm_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), (int)(imm), \
- (__v2di)_mm_setzero_di(), \
- (__mmask8)(U)); })
+ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+ (__v2di)_mm_setzero_di()); })
#define _mm256_alignr_epi64(A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v4di)_mm256_undefined_pd(), \
- (__mmask8)-1); })
+ (__m256i)__builtin_shufflevector((__v4di)(__m256i)(B), \
+ (__v4di)(__m256i)(A), \
+ ((int)(imm) & 0x3) + 0, \
+ ((int)(imm) & 0x3) + 1, \
+ ((int)(imm) & 0x3) + 2, \
+ ((int)(imm) & 0x3) + 3); })
#define _mm256_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+ (__v4di)(__m256i)(W)); })
#define _mm256_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
- (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)); })
+ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+ (__v4di)_mm256_setzero_si256()); })
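The shufflevector-based alignr forms read from the concatenation B:A and take consecutive elements starting at the masked immediate. A small sketch, again assuming AVX-512VL is enabled and using an illustrative wrapper name:

#include <immintrin.h>

/* For imm == 1 the result is { b[1], b[2], b[3], a[0] }: one 32-bit
 * element shifted out of the B:A concatenation. */
__m128i rotate_pair(__m128i a, __m128i b)
{
  return _mm_alignr_epi32(a, b, 1);
}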
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
diff --git a/lib/Headers/avxintrin.h b/lib/Headers/avxintrin.h
index 32e8546817b3..be03ba346031 100644
--- a/lib/Headers/avxintrin.h
+++ b/lib/Headers/avxintrin.h
@@ -57,7 +57,7 @@ typedef long long __m256i __attribute__((__vector_size__(32)));
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
+/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -75,7 +75,7 @@ _mm256_add_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
+/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -93,7 +93,7 @@ _mm256_add_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
+/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing the minuend.
@@ -111,7 +111,7 @@ _mm256_sub_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
+/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing the minuend.
@@ -130,7 +130,7 @@ _mm256_sub_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing the left source operand.
@@ -149,7 +149,7 @@ _mm256_addsub_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing the left source operand.
@@ -167,7 +167,7 @@ _mm256_addsub_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
+/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing the dividend.
@@ -185,7 +185,7 @@ _mm256_div_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
+/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing the dividend.
@@ -204,7 +204,7 @@ _mm256_div_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
+/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the operands.
@@ -223,7 +223,7 @@ _mm256_max_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
+/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the operands.
@@ -242,7 +242,7 @@ _mm256_max_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
+/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the operands.
@@ -261,7 +261,7 @@ _mm256_min_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
+/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the operands.
@@ -279,7 +279,7 @@ _mm256_min_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
+/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the operands.
@@ -297,7 +297,7 @@ _mm256_mul_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
+/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the operands.
@@ -316,7 +316,7 @@ _mm256_mul_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
+/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double].
@@ -333,7 +333,7 @@ _mm256_sqrt_pd(__m256d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
+/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
@@ -350,7 +350,7 @@ _mm256_sqrt_ps(__m256 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
+/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
@@ -367,7 +367,7 @@ _mm256_rsqrt_ps(__m256 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
+/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
@@ -389,24 +389,24 @@ _mm256_rcp_ps(__m256 __a)
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
/// A 256-bit vector of [4 x double].
/// \param M
-/// An integer value that specifies the rounding operation.
-/// Bits [7:4] are reserved.
-/// Bit [3] is a precision exception value:
-/// 0: A normal PE exception is used.
-/// 1: The PE field is not updated.
-/// Bit [2] is the rounding control source:
-/// 0: Use bits [1:0] of M.
-/// 1: Use the current MXCSR setting.
-/// Bits [1:0] contain the rounding control definition:
-/// 00: Nearest.
-/// 01: Downward (toward negative infinity).
-/// 10: Upward (toward positive infinity).
-/// 11: Truncated.
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used. \n
+/// 1: The PE field is not updated. \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M. \n
+/// 1: Use the current MXCSR setting. \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest. \n
+/// 01: Downward (toward negative infinity). \n
+/// 10: Upward (toward positive infinity). \n
+/// 11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) __extension__ ({ \
(__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
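Reading the bit layout above, M = 0x1 selects rounding toward negative infinity via bits [1:0], with bit 2 clear (do not consult MXCSR) and bit 3 clear (normal precision exception); this is effectively what _mm256_floor_pd does. A brief usage sketch, assuming AVX is enabled (e.g. -mavx); the wrapper name is illustrative:

#include <immintrin.h>

/* Round each double toward negative infinity: bits [1:0] = 01,
 * bit 2 = 0, bit 3 = 0. */
__m256d round_down(__m256d v)
{
  return _mm256_round_pd(v, 0x1);
}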
@@ -421,24 +421,24 @@ _mm256_rcp_ps(__m256 __a)
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
/// A 256-bit vector of [8 x float].
/// \param M
-/// An integer value that specifies the rounding operation.
-/// Bits [7:4] are reserved.
-/// Bit [3] is a precision exception value:
-/// 0: A normal PE exception is used.
-/// 1: The PE field is not updated.
-/// Bit [2] is the rounding control source:
-/// 0: Use bits [1:0] of M.
-/// 1: Use the current MXCSR setting.
-/// Bits [1:0] contain the rounding control definition:
-/// 00: Nearest.
-/// 01: Downward (toward negative infinity).
-/// 10: Upward (toward positive infinity).
-/// 11: Truncated.
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used. \n
+/// 1: The PE field is not updated. \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M. \n
+/// 1: Use the current MXCSR setting. \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest. \n
+/// 01: Downward (toward negative infinity). \n
+/// 10: Upward (toward positive infinity). \n
+/// 11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) __extension__ ({ \
(__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
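The single-precision form takes the same encoding; for example M = 0x3 truncates toward zero using bits [1:0]. A minimal sketch under the same AVX assumption, with an illustrative name:

#include <immintrin.h>

/* Truncate each float toward zero: bits [1:0] = 11, bit 2 = 0, bit 3 = 0. */
__m256 truncate_ps(__m256 v)
{
  return _mm256_round_ps(v, 0x3);
}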
@@ -453,7 +453,7 @@ _mm256_rcp_ps(__m256 __a)
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
/// A 256-bit vector of [4 x double].
@@ -470,7 +470,7 @@ _mm256_rcp_ps(__m256 __a)
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
/// A 256-bit vector of [4 x double].
@@ -488,7 +488,7 @@ _mm256_rcp_ps(__m256 __a)
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
/// A 256-bit vector of [8 x float].
@@ -505,7 +505,7 @@ _mm256_rcp_ps(__m256 __a)
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
-/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
/// A 256-bit vector of [8 x float].
@@ -517,7 +517,7 @@ _mm256_rcp_ps(__m256 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
+/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -535,7 +535,7 @@ _mm256_and_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
+/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -554,7 +554,7 @@ _mm256_and_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
+/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing the left source operand. The
@@ -575,7 +575,7 @@ _mm256_andnot_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
+/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing the left source operand. The
@@ -595,7 +595,7 @@ _mm256_andnot_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VORPD / ORPD instruction.
+/// This intrinsic corresponds to the <c> VORPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -613,7 +613,7 @@ _mm256_or_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VORPS / ORPS instruction.
+/// This intrinsic corresponds to the <c> VORPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -631,7 +631,7 @@ _mm256_or_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
+/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -649,7 +649,7 @@ _mm256_xor_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -669,7 +669,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -692,7 +692,7 @@ _mm256_hadd_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -715,7 +715,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double] containing one of the source operands.
@@ -738,7 +738,7 @@ _mm256_hsub_pd(__m256d __a, __m256d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float] containing one of the source operands.
@@ -762,23 +762,23 @@ _mm256_hsub_ps(__m256 __a, __m256 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __c
/// A 128-bit integer vector operand specifying how the values are to be
-/// copied.
-/// Bit [1]:
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// Bit [65]:
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector.
+/// copied. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [65]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
static __inline __m128d __DEFAULT_FN_ATTRS
_mm_permutevar_pd(__m128d __a, __m128i __c)
@@ -786,37 +786,37 @@ _mm_permutevar_pd(__m128d __a, __m128i __c)
return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
}
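Since only bit 1 of each 64-bit control element matters in the 128-bit form, a control vector whose low element has bit 1 set and whose high element has it clear reverses the two doubles. A short usage sketch, assuming AVX is enabled; the wrapper name is illustrative:

#include <immintrin.h>

/* Swap the two doubles: control element 0 has bit 1 set (take a[1]),
 * control element 1 has bit 1 clear (take a[0]). */
__m128d swap_pd(__m128d a)
{
  return _mm_permutevar_pd(a, _mm_set_epi64x(0, 2));
}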
-/// \brief Copies the values in a 256-bit vector of [4 x double] as
-/// specified by the 256-bit integer vector operand.
+/// \brief Copies the values in a 256-bit vector of [4 x double] as specified
+/// by the 256-bit integer vector operand.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double].
/// \param __c
/// A 256-bit integer vector operand specifying how the values are to be
-/// copied.
-/// Bit [1]:
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// Bit [65]:
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// Bit [129]:
-/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
-/// returned vector.
-/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
-/// returned vector.
-/// Bit [193]:
-/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
-/// returned vector.
-/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
+/// copied. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [65]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// Bit [129]: \n
+/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// Bit [193]: \n
+/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
/// returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
static __inline __m256d __DEFAULT_FN_ATTRS
@@ -827,52 +827,51 @@ _mm256_permutevar_pd(__m256d __a, __m256i __c)
/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
/// specified by the 128-bit integer vector operand.
-///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
/// \param __c
/// A 128-bit integer vector operand specifying how the values are to be
-/// copied.
-/// Bits [1:0]:
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// Bits [33:32]:
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// Bits [65:64]:
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// Bits [97:96]:
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector.
+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [33:32]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [65:64]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [97:96]: \n
+/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm_permutevar_ps(__m128 __a, __m128i __c)
@@ -885,85 +884,85 @@ _mm_permutevar_ps(__m128 __a, __m128i __c)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
/// \param __c
/// A 256-bit integer vector operand specifying how the values are to be
-/// copied.
-/// Bits [1:0]:
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// Bits [33:32]:
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// Bits [65:64]:
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// Bits [97:96]:
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// Bits [129:128]:
-/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// Bits [161:160]:
-/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// Bits [193:192]:
-/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// Bits [225:224]:
-/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
-/// returned vector.
+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [33:32]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [65:64]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [97:96]: \n
+/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// Bits [129:128]: \n
+/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// Bits [161:160]: \n
+/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// Bits [193:192]: \n
+/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// Bits [225:224]: \n
+/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
+/// returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_permutevar_ps(__m256 __a, __m256i __c)
@@ -971,8 +970,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
}
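
Likewise, a sketch for the 256-bit float form (same assumptions, hypothetical helper name): each 2-bit selector indexes within its own 128-bit half, so the control below reverses the four floats of each half.

static inline __m256 reverse_each_half_ps(__m256 v) {
  const __m256i ctrl = _mm256_set_epi32(0, 1, 2, 3, 0, 1, 2, 3); /* selectors read 3,2,1,0 in each half, low lane first */
  return _mm256_permutevar_ps(v, ctrl);                          /* { v3,v2,v1,v0 | v7,v6,v5,v4 } */
}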
-/// \brief Copies the values in a 128-bit vector of [2 x double] as
-/// specified by the immediate integer operand.
+/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
+/// by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
@@ -980,30 +979,31 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
/// A 128-bit vector of [2 x double].
/// \param C
-/// An immediate integer operand specifying how the values are to be copied.
-/// Bit [0]:
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// Bit [1]:
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector.
+/// An immediate integer operand specifying how the values are to be
+/// copied. \n
+/// Bit [0]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) __extension__ ({ \
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
(__v2df)_mm_undefined_pd(), \
((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
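
A sketch of the immediate form under the same assumptions: the two mask bits select the low and high result elements independently.

static inline __m128d swap_pd_imm(__m128d v) {
  return _mm_permute_pd(v, 0x1); /* bit 0 = 1: result[63:0] <- v[127:64]; bit 1 = 0: result[127:64] <- v[63:0] */
}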
-/// \brief Copies the values in a 256-bit vector of [4 x double] as
-/// specified by the immediate integer operand.
+/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
+/// the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
@@ -1011,32 +1011,33 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERMILPD / PERMILPD instruction.
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
/// A 256-bit vector of [4 x double].
/// \param C
-/// An immediate integer operand specifying how the values are to be copied.
-/// Bit [0]:
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector.
-/// Bit [1]:
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// Bit [2]:
-/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
-/// returned vector.
-/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
-/// returned vector.
-/// Bit [3]:
-/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
-/// returned vector.
-/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
-/// returned vector.
+/// An immediate integer operand specifying how the values are to be
+/// copied. \n
+/// Bit [0]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// Bit [2]: \n
+/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// Bit [3]: \n
+/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
+/// returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) __extension__ ({ \
(__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
@@ -1046,8 +1047,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
2 + (((C) >> 2) & 0x1), \
2 + (((C) >> 3) & 0x1)); })
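
And for the 256-bit double form (same assumptions): each 128-bit half is permuted by its own pair of immediate bits, so 0x5 swaps the pair inside each half.

static inline __m256d swap_pairs_pd(__m256d v) {
  return _mm256_permute_pd(v, 0x5); /* 0b0101 -> { v1, v0, v3, v2 } */
}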
-/// \brief Copies the values in a 128-bit vector of [4 x float] as
-/// specified by the immediate integer operand.
+/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
+/// the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
@@ -1055,48 +1056,49 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
/// A 128-bit vector of [4 x float].
/// \param C
-/// An immediate integer operand specifying how the values are to be copied.
-/// Bits [1:0]:
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// Bits [3:2]:
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// Bits [5:4]:
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// Bits [7:6]:
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector.
+/// An immediate integer operand specifying how the values are to be
+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [3:2]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [5:4]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [7:6]: \n
+/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) __extension__ ({ \
(__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
@@ -1104,8 +1106,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
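
A corresponding sketch for the [4 x float] immediate form (same assumptions): the four 2-bit selectors below read 3,2,1,0 and therefore reverse the vector.

static inline __m128 reverse_ps(__m128 v) {
  return _mm_permute_ps(v, 0x1B); /* 0b00011011 -> { v3, v2, v1, v0 } */
}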
-/// \brief Copies the values in a 256-bit vector of [8 x float] as
-/// specified by the immediate integer operand.
+/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
+/// the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
@@ -1113,84 +1115,85 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
/// A 256-bit vector of [8 x float].
/// \param C
-/// An immediate integer operand specifying how the values are to be copied.
-/// Bits [1:0]:
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector.
-/// Bits [3:2]:
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector.
-/// Bits [5:4]:
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector.
-/// Bits [7:6]:
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// Bits [1:0]:
-/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
-/// returned vector.
-/// Bits [3:2]:
-/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
-/// returned vector.
-/// Bits [5:4]:
-/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
-/// returned vector.
-/// Bits [7:6]:
-/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
-/// returned vector.
+///    An immediate integer operand specifying how the values are to be

+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [3:2]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [5:4]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [7:6]: \n
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// Bits [1:0]: \n
+/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// Bits [3:2]: \n
+/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// Bits [5:4]: \n
+/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// Bits [7:6]: \n
+/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
+/// returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) __extension__ ({ \
(__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
@@ -1213,7 +1216,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
/// A 256-bit vector of [4 x double].
@@ -1221,25 +1224,25 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
///    A 256-bit vector of [4 x double].
/// \param M
/// An immediate integer operand specifying how the values are to be
-/// permuted.
-/// Bits [1:0]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// Bits [5:4]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
-/// destination.
+/// permuted. \n
+/// Bits [1:0]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// Bits [5:4]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+/// destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
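
As a sketch of the selector encoding described above for _mm256_permute2f128_pd (same assumptions; the helper name is hypothetical), passing the same vector twice with M = 0x01 exchanges its two 128-bit halves.

static inline __m256d swap_128_halves_pd(__m256d v) {
  return _mm256_permute2f128_pd(v, v, 0x01); /* bits[1:0]=01: high half -> low; bits[5:4]=00: low half -> high */
}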
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
@@ -1254,7 +1257,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
/// A 256-bit vector of [8 x float].
@@ -1262,24 +1265,24 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// A 256-bit vector of [8 x float].
/// \param M
/// An immediate integer operand specifying how the values are to be
-/// permuted.
-/// Bits [1:0]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// Bits [5:4]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
+/// permuted. \n
+/// Bits [1:0]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// Bits [5:4]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
/// destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
@@ -1295,7 +1298,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPERM2F128 / PERM2F128 instruction.
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
/// A 256-bit integer vector.
@@ -1303,23 +1306,23 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// A 256-bit integer vector.
/// \param M
/// An immediate integer operand specifying how the values are to be copied.
-/// Bits [1:0]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [127:0] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [127:0] of the
-/// destination.
-/// Bits [5:4]:
-/// 00: Bits [127:0] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 01: Bits [255:128] of operand V1 are copied to bits [255:128] of the
-/// destination.
-/// 10: Bits [127:0] of operand V2 are copied to bits [255:128] of the
-/// destination.
-/// 11: Bits [255:128] of operand V2 are copied to bits [255:128] of the
+/// Bits [1:0]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// Bits [5:4]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
/// destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
@@ -1337,7 +1340,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VBLENDPD / BLENDPD instruction.
+/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
/// A 256-bit vector of [4 x double].
@@ -1347,9 +1350,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// An immediate integer operand, with mask bits [3:0] specifying how the
/// values are to be copied. The position of the mask bit corresponds to the
/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
-/// element in operand V1 is copied to the same position in the destination.
-/// When a mask bit is 1, the corresponding 64-bit element in operand V2 is
-/// copied to the same position in the destination.
+/// element in operand \a V1 is copied to the same position in the
+/// destination. When a mask bit is 1, the corresponding 64-bit element in
+/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
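
For example (same assumptions, hypothetical helper), a 4-bit mask of 0b1010 takes elements 1 and 3 from the second operand and the rest from the first.

static inline __m256d blend_odd_from_b(__m256d a, __m256d b) {
  return _mm256_blend_pd(a, b, 0x0A); /* elements 0,2 from a; elements 1,3 from b */
}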
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
(__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
@@ -1369,7 +1372,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VBLENDPS / BLENDPS instruction.
+/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
/// A 256-bit vector of [8 x float].
@@ -1379,9 +1382,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// An immediate integer operand, with mask bits [7:0] specifying how the
/// values are to be copied. The position of the mask bit corresponds to the
/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
-/// element in operand V1 is copied to the same position in the destination.
-/// When a mask bit is 1, the corresponding 32-bit element in operand V2 is
-/// copied to the same position in the destination.
+/// element in operand \a V1 is copied to the same position in the
+/// destination. When a mask bit is 1, the corresponding 32-bit element in
+/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
(__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
@@ -1401,7 +1404,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VBLENDVPD / BLENDVPD instruction.
+/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double].
@@ -1411,9 +1414,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
/// how the values are to be copied. The position of the mask bit corresponds
/// to the most significant bit of a copied value. When a mask bit is 0, the
-/// corresponding 64-bit element in operand __a is copied to the same
+/// corresponding 64-bit element in operand \a __a is copied to the same
/// position in the destination. When a mask bit is 1, the corresponding
-/// 64-bit element in operand __b is copied to the same position in the
+/// 64-bit element in operand \a __b is copied to the same position in the
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
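
A short sketch of the variable blend (same assumptions): a comparison supplies the mask sign bits, so negatives in v are replaced by zero.

static inline __m256d clamp_negatives_to_zero(__m256d v) {
  const __m256d zero = _mm256_setzero_pd();
  const __m256d mask = _mm256_cmp_pd(v, zero, _CMP_LT_OQ); /* all-ones (sign bit set) where v < 0 */
  return _mm256_blendv_pd(v, zero, mask);                  /* take 0.0 where the mask sign bit is 1 */
}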
static __inline __m256d __DEFAULT_FN_ATTRS
@@ -1429,7 +1432,7 @@ _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VBLENDVPS / BLENDVPS instruction.
+/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
@@ -1439,9 +1442,9 @@ _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
/// and 31 specifying how the values are to be copied. The position of the
/// mask bit corresponds to the most significant bit of a copied value. When
-/// a mask bit is 0, the corresponding 32-bit element in operand __a is
+/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
/// copied to the same position in the destination. When a mask bit is 1, the
-/// corresponding 32-bit element in operand __b is copied to the same
+/// corresponding 32-bit element in operand \a __b is copied to the same
/// position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
static __inline __m256 __DEFAULT_FN_ATTRS
@@ -1455,12 +1458,12 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \brief Computes two dot products in parallel, using the lower and upper
/// halves of two [8 x float] vectors as input to the two computations, and
/// returning the two dot products in the lower and upper halves of the
-/// [8 x float] result. The immediate integer operand controls which
-/// input elements will contribute to the dot product, and where the final
-/// results are returned. In general, for each dot product, the four
-/// corresponding elements of the input vectors are multiplied; the first
-/// two and second two products are summed, then the two sums are added to
-/// form the final result.
+/// [8 x float] result. The immediate integer operand controls which input
+/// elements will contribute to the dot product, and where the final results
+/// are returned. In general, for each dot product, the four corresponding
+/// elements of the input vectors are multiplied; the first two and second
+/// two products are summed, then the two sums are added to form the final
+/// result.
///
/// \headerfile <x86intrin.h>
///
@@ -1468,7 +1471,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
-/// This intrinsic corresponds to the \c VDPPS / DPPS instruction.
+/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
/// A vector of [8 x float] values, treated as two [4 x float] vectors.
@@ -1510,7 +1513,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
-/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
+/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
/// A 256-bit vector of [8 x float]. The four selected elements in this
@@ -1522,22 +1525,23 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// destination, according to the bits specified in the immediate operand.
/// \param mask
/// An immediate value containing an 8-bit value specifying which elements to
-/// copy from a and b. Bits [3:0] specify the values copied from operand a.
-/// Bits [7:4] specify the values copied from operand b.
+///    copy from \a a and \a b. \n
+///    Bits [3:0] specify the values copied from operand \a a. \n
+///    Bits [7:4] specify the values copied from operand \a b. \n
/// The destinations within the 256-bit destination are assigned values as
-/// follows, according to the bit value assignments described below:
+/// follows, according to the bit value assignments described below: \n
/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
-/// destination.
+/// destination. \n
/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
-/// destination.
+/// destination. \n
/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
-/// destination.
+/// destination. \n
/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
-/// the destination.
-/// Bit value assignments:
-/// 00: Bits [31:0] and [159:128] are copied from the selected operand.
-/// 01: Bits [63:32] and [191:160] are copied from the selected operand.
-/// 10: Bits [95:64] and [223:192] are copied from the selected operand.
+/// the destination. \n
+/// Bit value assignments: \n
+/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
+/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
+/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
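
A sketch of the selector layout (same assumptions, hypothetical helper): with mask 0x44, elements 0-1 of each 128-bit half come from operand a and elements 2-3 from operand b.

static inline __m256 low_pairs_ps(__m256 a, __m256 b) {
  return _mm256_shuffle_ps(a, b, 0x44); /* 0b01000100 -> { a0,a1,b0,b1 | a4,a5,b4,b5 } */
}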
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
@@ -1567,7 +1571,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
-/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction.
+/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
/// A 256-bit vector of [4 x double].
@@ -1575,22 +1579,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 256-bit vector of [4 x double].
/// \param mask
/// An immediate value containing 8-bit values specifying which elements to
-/// copy from a and b:
-/// Bit [0]=0: Bits [63:0] are copied from a to bits [63:0] of the
-/// destination.
-/// Bit [0]=1: Bits [127:64] are copied from a to bits [63:0] of the
-/// destination.
-/// Bit [1]=0: Bits [63:0] are copied from b to bits [127:64] of the
-/// destination.
-/// Bit [1]=1: Bits [127:64] are copied from b to bits [127:64] of the
-/// destination.
-/// Bit [2]=0: Bits [191:128] are copied from a to bits [191:128] of the
-/// destination.
-/// Bit [2]=1: Bits [255:192] are copied from a to bits [191:128] of the
-/// destination.
-/// Bit [3]=0: Bits [191:128] are copied from b to bits [255:192] of the
-/// destination.
-/// Bit [3]=1: Bits [255:192] are copied from b to bits [255:192] of the
+/// copy from \a a and \a b: \n
+/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
+/// destination. \n
+/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
+/// destination. \n
+/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
+/// destination. \n
+/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
+/// destination. \n
+/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
+/// destination. \n
+/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
+/// destination. \n
+/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
+/// destination. \n
+/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
@@ -1647,7 +1651,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
/// A 128-bit vector of [2 x double].
@@ -1655,16 +1659,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 128-bit vector of [2 x double].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
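
A sketch of the predicate encoding (same assumptions): 01h is the ordered, signaling "less than" comparison, also spelled _CMP_LT_OS.

static inline __m128d less_than_pd(__m128d a, __m128d b) {
  return _mm_cmp_pd(a, b, 0x01); /* all-ones in each element where a < b, all-zeros otherwise */
}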
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
@@ -1683,7 +1688,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
/// A 128-bit vector of [4 x float].
@@ -1691,16 +1696,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 128-bit vector of [4 x float].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
@@ -1719,7 +1725,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPPD / CMPPD instruction.
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
/// A 256-bit vector of [4 x double].
@@ -1727,16 +1733,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 256-bit vector of [4 x double].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
@@ -1755,7 +1762,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPPS / CMPPS instruction.
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
/// A 256-bit vector of [8 x float].
@@ -1763,16 +1770,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 256-bit vector of [8 x float].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
@@ -1790,7 +1798,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPSD / CMPSD instruction.
+/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
/// A 128-bit vector of [2 x double].
@@ -1798,16 +1806,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 128-bit vector of [2 x double].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
@@ -1825,7 +1834,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCMPSS / CMPSS instruction.
+/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
/// A 128-bit vector of [4 x float].
@@ -1833,16 +1842,17 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// A 128-bit vector of [4 x float].
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use:
-/// 00h, 08h, 10h, 18h: Equal
-/// 01h, 09h, 11h, 19h: Less than
-/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal (swapped
-/// operands)
-/// 03h, 0Bh, 13h, 1Bh: Unordered
-/// 04h, 0Ch, 14h, 1Ch: Not equal
-/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than (swapped operands)
+/// operation to use: \n
+/// 00h, 08h, 10h, 18h: Equal \n
+/// 01h, 09h, 11h, 19h: Less than \n
+/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+/// (swapped operands) \n
+/// 03h, 0Bh, 13h, 1Bh: Unordered \n
+/// 04h, 0Ch, 14h, 1Ch: Not equal \n
+/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-/// (swapped operands)
+/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
@@ -1854,8 +1864,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-/// EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 256-bit vector of [8 x i32].
@@ -1876,8 +1886,8 @@ _mm256_extract_epi32(__m256i __a, const int __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-/// EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 256-bit integer vector of [16 x i16].
@@ -1898,8 +1908,8 @@ _mm256_extract_epi16(__m256i __a, const int __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-/// EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 256-bit integer vector of [32 x i8].
@@ -1921,8 +1931,8 @@ _mm256_extract_epi8(__m256i __a, const int __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VEXTRACTF128+COMPOSITE /
-/// EXTRACTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 256-bit integer vector of [4 x i64].
@@ -1945,8 +1955,8 @@ _mm256_extract_epi64(__m256i __a, const int __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-/// INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A vector of [8 x i32] to be used by the insert operation.
@@ -1955,8 +1965,8 @@ _mm256_extract_epi64(__m256i __a, const int __imm)
/// \param __imm
/// An immediate integer specifying the index of the vector element to be
/// replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-/// with __b.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \a __imm with \a __b.
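
For instance (same assumptions, hypothetical helper), replacing 32-bit element 3 of a vector with a constant:

static inline __m256i set_element3(__m256i v) {
  return _mm256_insert_epi32(v, 42, 3); /* copy of v with element 3 replaced by 42 */
}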
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
{
@@ -1972,8 +1982,8 @@ _mm256_insert_epi32(__m256i __a, int __b, int const __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-/// INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A vector of [16 x i16] to be used by the insert operation.
@@ -1982,8 +1992,8 @@ _mm256_insert_epi32(__m256i __a, int __b, int const __imm)
/// \param __imm
/// An immediate integer specifying the index of the vector element to be
/// replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-/// with __b.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \a __imm with \a __b.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
{
@@ -1998,8 +2008,8 @@ _mm256_insert_epi16(__m256i __a, int __b, int const __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-/// INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A vector of [32 x i8] to be used by the insert operation.
@@ -2008,8 +2018,8 @@ _mm256_insert_epi16(__m256i __a, int __b, int const __imm)
/// \param __imm
/// An immediate integer specifying the index of the vector element to be
/// replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-/// with __b.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \a __imm with \a __b.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
{
@@ -2025,8 +2035,8 @@ _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VINSERTF128+COMPOSITE /
-/// INSERTF128+COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A vector of [4 x i64] to be used by the insert operation.
@@ -2035,8 +2045,8 @@ _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
/// \param __imm
/// An immediate integer specifying the index of the vector element to be
/// replaced.
-/// \returns A copy of vector __a, after replacing its element indexed by __imm
-/// with __b.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \a __imm with \a __b.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
{
@@ -2051,7 +2061,7 @@ _mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction.
+/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector of [4 x i32].
@@ -2066,7 +2076,7 @@ _mm256_cvtepi32_pd(__m128i __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
///
/// \param __a
/// A 256-bit integer vector.
@@ -2082,7 +2092,7 @@ _mm256_cvtepi32_ps(__m256i __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double].
@@ -2097,7 +2107,7 @@ _mm256_cvtpd_ps(__m256d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
@@ -2108,24 +2118,66 @@ _mm256_cvtps_epi32(__m256 __a)
return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
}
+/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
+/// x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_cvtps_pd(__m128 __a)
{
return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
}
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
+/// x i32], truncating the result by rounding towards zero when it is
+/// inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_cvttpd_epi32(__m256d __a)
{
return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
}
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
+/// x i32]. When a conversion is inexact, the value returned is rounded
+/// according to the rounding control bits in the MXCSR register.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_cvtpd_epi32(__m256d __a)
{
return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
}
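
To make the difference between the two [4 x double] to [4 x i32] conversions above concrete, here is a minimal sketch (illustrative only, not part of this patch; assumes a compiler invoked with -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d v = _mm256_set_pd(2.5, -1.5, 1.7, -1.7);
    __m128i rounded   = _mm256_cvtpd_epi32(v);   /* rounds per MXCSR (default nearest-even): -2 2 -2 2 */
    __m128i truncated = _mm256_cvttpd_epi32(v);  /* always truncates toward zero: -1 1 -1 2 */
    int r[4], t[4];
    _mm_storeu_si128((__m128i *)r, rounded);
    _mm_storeu_si128((__m128i *)t, truncated);
    for (int i = 0; i < 4; ++i)
        printf("%d %d\n", r[i], t[i]);
    return 0;
}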
+/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
+/// truncating the result by rounding towards zero when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_cvttps_epi32(__m256 __a)
{
@@ -2152,18 +2204,73 @@ _mm256_cvtss_f32(__m256 __a)
}
/* Vector replicate */
+/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
+/// vector of [8 x float] to float values in a 256-bit vector of
+/// [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
+/// the return value. \n
+/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
+/// the return value. \n
+/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
+/// return value. \n
+/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
+/// return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+/// values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_movehdup_ps(__m256 __a)
{
return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
}
+/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
+/// vector of [8 x float] to float values in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
+/// the return value. \n
+/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
+/// the return value. \n
+/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
+/// return value. \n
+/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
+/// return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+/// values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_moveldup_ps(__m256 __a)
{
return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
}
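
A short sketch of the duplication patterns documented for the two shuffles above (illustrative only, not part of the header; assumes -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256 v = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);   /* element i holds the value i */
    float hi[8], lo[8];
    _mm256_storeu_ps(hi, _mm256_movehdup_ps(v));  /* odd elements duplicated: 1 1 3 3 5 5 7 7 */
    _mm256_storeu_ps(lo, _mm256_moveldup_ps(v));  /* even elements duplicated: 0 0 2 2 4 4 6 6 */
    for (int i = 0; i < 8; ++i)
        printf("%g %g\n", hi[i], lo[i]);
    return 0;
}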
+/// \brief Moves and duplicates double-precision floating point values from a
+/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
+/// vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double]. \n
+/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
+/// return value. \n
+/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
+/// the return value.
+/// \returns A 256-bit vector of [4 x double] containing the moved and
+/// duplicated values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_movedup_pd(__m256d __a)
{
@@ -2171,24 +2278,98 @@ _mm256_movedup_pd(__m256d __a)
}
/* Unpack and Interleave */
+/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
+/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [127:64] are written to bits [63:0] of the return value. \n
+/// Bits [255:192] are written to bits [191:128] of the return value. \n
+/// \param __b
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [127:64] are written to bits [127:64] of the return value. \n
+/// Bits [255:192] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_unpackhi_pd(__m256d __a, __m256d __b)
{
return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
}
+/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
+/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [63:0] are written to bits [63:0] of the return value. \n
+/// Bits [191:128] are written to bits [191:128] of the return value.
+/// \param __b
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [63:0] are written to bits [127:64] of the return value. \n
+/// Bits [191:128] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_unpacklo_pd(__m256d __a, __m256d __b)
{
return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
}
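
The bit ranges above amount to the 256-bit unpacks working independently within each 128-bit lane; a minimal sketch (illustrative only, assumes -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d a = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);   /* elements 0..3 hold 0 1 2 3 */
    __m256d b = _mm256_set_pd(7.0, 6.0, 5.0, 4.0);   /* elements 0..3 hold 4 5 6 7 */
    double hi[4], lo[4];
    _mm256_storeu_pd(hi, _mm256_unpackhi_pd(a, b));  /* 1 5 3 7 */
    _mm256_storeu_pd(lo, _mm256_unpacklo_pd(a, b));  /* 0 4 2 6 */
    printf("hi: %g %g %g %g\n", hi[0], hi[1], hi[2], hi[3]);
    printf("lo: %g %g %g %g\n", lo[0], lo[1], lo[2], lo[3]);
    return 0;
}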
+/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
+/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+/// vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [95:64] are written to bits [31:0] of the return value. \n
+/// Bits [127:96] are written to bits [95:64] of the return value. \n
+/// Bits [223:192] are written to bits [159:128] of the return value. \n
+/// Bits [255:224] are written to bits [223:192] of the return value.
+/// \param __b
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [95:64] are written to bits [63:32] of the return value. \n
+/// Bits [127:96] are written to bits [127:96] of the return value. \n
+/// Bits [223:192] are written to bits [191:160] of the return value. \n
+/// Bits [255:224] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_unpackhi_ps(__m256 __a, __m256 __b)
{
return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}
+/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
+/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+/// vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [31:0] are written to bits [31:0] of the return value. \n
+/// Bits [63:32] are written to bits [95:64] of the return value. \n
+/// Bits [159:128] are written to bits [159:128] of the return value. \n
+/// Bits [191:160] are written to bits [223:192] of the return value.
+/// \param __b
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [31:0] are written to bits [63:32] of the return value. \n
+/// Bits [63:32] are written to bits [127:96] of the return value. \n
+/// Bits [159:128] are written to bits [191:160] of the return value. \n
+/// Bits [191:160] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_unpacklo_ps(__m256 __a, __m256 __b)
{
@@ -2196,90 +2377,401 @@ _mm256_unpacklo_ps(__m256 __a, __m256 __b)
}
/* Bit Test */
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+/// element-by-element comparison of the double-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns the ZF flag in the EFLAGS register.
static __inline int __DEFAULT_FN_ATTRS
_mm_testz_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+/// element-by-element comparison of the double-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns the CF flag in the EFLAGS register.
static __inline int __DEFAULT_FN_ATTRS
_mm_testc_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+/// element-by-element comparison of the double-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm_testnzc_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
}
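
A minimal sketch of the ZF/CF semantics documented above for the 128-bit double-precision variants (illustrative only, not part of this patch; assumes -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128d a = _mm_set_pd(-1.0,  1.0);     /* sign bits, elements 0..1: 0 1 */
    __m128d b = _mm_set_pd(-2.0, -3.0);     /* sign bits, elements 0..1: 1 1 */
    /* Element 1 has both sign bits set, so ZF = 0. */
    printf("testz   = %d\n", _mm_testz_pd(a, b));    /* prints 0 */
    /* Element 0 has sign(a) = 0 and sign(b) = 1, so CF = 0. */
    printf("testc   = %d\n", _mm_testc_pd(a, b));    /* prints 0 */
    /* ZF == 0 and CF == 0, so the "not zero, not carry" test returns 1. */
    printf("testnzc = %d\n", _mm_testnzc_pd(a, b));  /* prints 1 */
    return 0;
}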
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns the ZF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm_testz_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
}
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns the CF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm_testc_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
}
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm_testnzc_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+/// element-by-element comparison of the double-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double].
+/// \returns the ZF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_pd(__m256d __a, __m256d __b)
{
return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+/// element-by-element comparison of the double-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double].
+/// \returns the CF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_pd(__m256d __a, __m256d __b)
{
return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+/// element-by-element comparison of the double-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_pd(__m256d __a, __m256d __b)
{
return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float].
+/// \returns the ZF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_ps(__m256 __a, __m256 __b)
{
return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float].
+/// \returns the CF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_ps(__m256 __a, __m256 __b)
{
return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
}
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+/// element-by-element comparison of the single-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector. The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_ps(__m256 __a, __m256 __b)
{
return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
}
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+/// of the two source vectors and update the EFLAGS register as follows: \n
+/// If there is at least one pair of bits where both bits are 1, the ZF flag
+/// is set to 0. Otherwise the ZF flag is set to 1. \n
+/// If there is at least one pair of bits where the bit from the first source
+/// vector is 0 and the bit from the second source vector is 1, the CF flag
+/// is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \param __b
+/// A 256-bit integer vector.
+/// \returns the ZF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testz_si256(__m256i __a, __m256i __b)
{
return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
}
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+/// of the two source vectors and update the EFLAGS register as follows: \n
+/// If there is at least one pair of bits where both bits are 1, the ZF flag
+/// is set to 0. Otherwise the ZF flag is set to 1. \n
+/// If there is at least one pair of bits where the bit from the first source
+/// vector is 0 and the bit from the second source vector is 1, the CF flag
+/// is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \param __b
+/// A 256-bit integer vector.
+/// \returns the CF flag.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testc_si256(__m256i __a, __m256i __b)
{
return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
}
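
In contrast to the VTESTPS/VTESTPD variants above, the VPTEST forms test whole bit patterns rather than sign bits; a minimal sketch (illustrative only, assumes -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i mask = _mm256_set1_epi32(0x0000FFFF);    /* low half of every dword */
    __m256i low  = _mm256_set1_epi32(0x00001234);    /* bits only inside the mask */
    __m256i high = _mm256_set1_epi32(0x12340000);    /* bits only outside the mask */
    printf("%d\n", _mm256_testz_si256(mask, high));  /* 1: mask AND high == 0 */
    printf("%d\n", _mm256_testz_si256(mask, low));   /* 0: mask AND low != 0 */
    printf("%d\n", _mm256_testc_si256(mask, low));   /* 1: low sets no bits outside mask */
    return 0;
}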
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+/// of the two source vectors and update the EFLAGS register as follows: \n
+/// If there is at least one pair of bits where both bits are 1, the ZF flag
+/// is set to 0. Otherwise the ZF flag is set to 1. \n
+/// If there is at least one pair of bits where the bit from the first source
+/// vector is 0 and the bit from the second source vector is 1, the CF flag
+/// is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \param __b
+/// A 256-bit integer vector.
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
static __inline int __DEFAULT_FN_ATTRS
_mm256_testnzc_si256(__m256i __a, __m256i __b)
{
@@ -2287,12 +2779,36 @@ _mm256_testnzc_si256(__m256i __a, __m256i __b)
}
/* Vector extract sign mask */
+/// \brief Extracts the sign bits of double-precision floating point elements
+/// in a 256-bit vector of [4 x double] and writes them to the lower order
+/// bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the double-precision
+/// floating point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [3:0].
static __inline int __DEFAULT_FN_ATTRS
_mm256_movemask_pd(__m256d __a)
{
return __builtin_ia32_movmskpd256((__v4df)__a);
}
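
A quick sketch of the sign-bit extraction described above (illustrative only; assumes -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d v = _mm256_set_pd(-4.0, 3.0, -2.0, 1.0);  /* signs of elements 0..3: 0 1 0 1 */
    int m = _mm256_movemask_pd(v);                    /* bit i holds the sign of element i */
    printf("mask = 0x%x\n", m);                       /* prints 0xa (binary 1010) */
    return 0;
}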
+/// \brief Extracts the sign bits of single-precision floating point elements
+/// in a 256-bit vector of [8 x float] and writes them to the lower order
+/// bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the single-precision floating
+///    point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [7:0].
static __inline int __DEFAULT_FN_ATTRS
_mm256_movemask_ps(__m256 __a)
{
@@ -2300,12 +2816,22 @@ _mm256_movemask_ps(__m256 __a)
}
/* Vector __zero */
+/// \brief Zeroes the contents of all XMM or YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void __DEFAULT_FN_ATTRS
_mm256_zeroall(void)
{
__builtin_ia32_vzeroall();
}
+/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void __DEFAULT_FN_ATTRS
_mm256_zeroupper(void)
{
@@ -2313,6 +2839,18 @@ _mm256_zeroupper(void)
}
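
A hedged sketch of the usual reason to call _mm256_zeroupper(): clearing the upper YMM state before running separately compiled, non-VEX SSE code can avoid AVX/SSE transition penalties on some processors. The stub function, variable names, and C11 _Alignas usage are assumptions for illustration; this is not part of the header.

#include <immintrin.h>
#include <stdio.h>

/* Stands in for separately compiled code that uses legacy (non-VEX) SSE encodings. */
static void legacy_sse_code(const float *s4) {
    printf("%g %g %g %g\n", s4[0], s4[1], s4[2], s4[3]);
}

int main(void) {
    _Alignas(32) float in[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    __m256 v = _mm256_load_ps(in);
    __m128 s = _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
    float out[4];
    _mm_storeu_ps(out, s);
    _mm256_zeroupper();     /* clear the upper YMM state before the legacy code runs */
    legacy_sse_code(out);
    return 0;
}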
/* Vector load with broadcast */
+/// \brief Loads a scalar single-precision floating point value from the
+/// specified address pointed to by \a __a and broadcasts it to the elements
+/// of a [4 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+///    A pointer to the single-precision floating point value to be broadcast.
+/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
+/// equal to the broadcast value.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm_broadcast_ss(float const *__a)
{
@@ -2320,6 +2858,18 @@ _mm_broadcast_ss(float const *__a)
return (__m128)(__v4sf){ __f, __f, __f, __f };
}
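
A minimal usage sketch for the broadcast form above (illustrative only; assumes -mavx; the variable names are made up):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    float scale = 2.5f;
    __m128 v = _mm_broadcast_ss(&scale);   /* { 2.5, 2.5, 2.5, 2.5 } */
    float out[4];
    _mm_storeu_ps(out, v);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}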
+/// \brief Loads a scalar double-precision floating point value from the
+/// specified address pointed to by \a __a and broadcasts it to the elements
+/// of a [4 x double] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
+///
+/// \param __a
+///    A pointer to the double-precision floating point value to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
+/// equal to the broadcast value.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_sd(double const *__a)
{
@@ -2327,6 +2877,18 @@ _mm256_broadcast_sd(double const *__a)
return (__m256d)(__v4df){ __d, __d, __d, __d };
}
+/// \brief Loads a scalar single-precision floating point value from the
+/// specified address pointed to by \a __a and broadcasts it to the elements
+/// of a [8 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+///    A pointer to the single-precision floating point value to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
+/// equal to the broadcast value.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_ss(float const *__a)
{
@@ -2334,12 +2896,36 @@ _mm256_broadcast_ss(float const *__a)
return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
}
+/// \brief Loads a 128-bit vector of [2 x double] from the memory location
+///    pointed to by \a __a and broadcasts it to both 128-bit elements of a
+///    256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+///    A pointer to the 128-bit vector of [2 x double] to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
+/// equal to the broadcast value.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_pd(__m128d const *__a)
{
return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
}
+/// \brief Loads a 128-bit vector of [4 x float] from the memory location
+///    pointed to by \a __a and broadcasts it to both 128-bit elements of a
+///    256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+///    A pointer to the 128-bit vector of [4 x float] to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
+/// equal to the broadcast value.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_ps(__m128 const *__a)
{
@@ -2347,18 +2933,50 @@ _mm256_broadcast_ps(__m128 const *__a)
}
/* SIMD load ops */
+/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
+/// memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location containing
+/// double-precision floating point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_load_pd(double const *__p)
{
return *(__m256d *)__p;
}
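
A small sketch contrasting the aligned load above with its unaligned counterpart documented below (illustrative only; assumes C11 for _Alignas and a compiler invoked with -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    _Alignas(32) double aligned[4] = { 1.0, 2.0, 3.0, 4.0 };   /* satisfies the VMOVAPD contract */
    double plain[4] = { 10.0, 20.0, 30.0, 40.0 };              /* no alignment guarantee */

    __m256d a = _mm256_load_pd(aligned);   /* requires 32-byte alignment */
    __m256d b = _mm256_loadu_pd(plain);    /* tolerates any alignment */
    double out[4];
    _mm256_storeu_pd(out, _mm256_add_pd(a, b));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}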
+/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
+/// memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location containing float values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_load_ps(float const *__p)
{
return *(__m256 *)__p;
}
+/// \brief Loads 4 double-precision floating point values from an unaligned
+/// memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location containing double-precision floating
+/// point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu_pd(double const *__p)
{
@@ -2368,6 +2986,17 @@ _mm256_loadu_pd(double const *__p)
return ((struct __loadu_pd*)__p)->__v;
}
+/// \brief Loads 8 single-precision floating point values from an unaligned
+/// memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location containing single-precision floating
+/// point values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu_ps(float const *__p)
{
@@ -2377,12 +3006,33 @@ _mm256_loadu_ps(float const *__p)
return ((struct __loadu_ps*)__p)->__v;
}
+/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
+/// location pointed to by \a __p into elements of a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
+/// values.
+/// \returns A 256-bit integer vector containing the moved values.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_load_si256(__m256i const *__p)
{
return *__p;
}
+/// \brief Loads 256 bits of integer data from an unaligned memory location
+/// pointed to by \a __p into a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+/// A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu_si256(__m256i const *__p)
{
@@ -2392,6 +3042,18 @@ _mm256_loadu_si256(__m256i const *__p)
return ((struct __loadu_si256*)__p)->__v;
}
+/// \brief Loads 256 bits of integer data from an unaligned memory location
+/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
+/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
+/// line boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
+///
+/// \param __p
+/// A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_lddqu_si256(__m256i const *__p)
{
@@ -2399,18 +3061,55 @@ _mm256_lddqu_si256(__m256i const *__p)
}
/* SIMD store ops */
+/// \brief Stores double-precision floating point values from a 256-bit vector
+/// of [4 x double] to a 32-byte aligned memory location pointed to by
+/// \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location that will receive the
+///    double-precision floating point values.
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_store_pd(double *__p, __m256d __a)
{
*(__m256d *)__p = __a;
}
+/// \brief Stores single-precision floating point values from a 256-bit vector
+/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location that will receive the
+/// float values.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_store_ps(float *__p, __m256 __a)
{
*(__m256 *)__p = __a;
}
+/// \brief Stores double-precision floating point values from a 256-bit vector
+/// of [4 x double] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the double-precision
+/// floating point values.
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_pd(double *__p, __m256d __a)
{
@@ -2420,6 +3119,17 @@ _mm256_storeu_pd(double *__p, __m256d __a)
((struct __storeu_pd*)__p)->__v = __a;
}
+/// \brief Stores single-precision floating point values from a 256-bit vector
+/// of [8 x float] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the float values.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_ps(float *__p, __m256 __a)
{
@@ -2429,12 +3139,35 @@ _mm256_storeu_ps(float *__p, __m256 __a)
((struct __storeu_ps*)__p)->__v = __a;
}
+/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
+/// aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location that will receive the
+/// integer values.
+/// \param __a
+/// A 256-bit integer vector containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_store_si256(__m256i *__p, __m256i __a)
{
*__p = __a;
}
+/// \brief Stores integer values from a 256-bit integer vector to an unaligned
+/// memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the integer values.
+/// \param __a
+/// A 256-bit integer vector containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu_si256(__m256i *__p, __m256i __a)
{
@@ -2445,12 +3178,48 @@ _mm256_storeu_si256(__m256i *__p, __m256i __a)
}
/* Conditional load ops */
+/// \brief Conditionally loads double-precision floating point elements from a
+/// memory location pointed to by \a __p into a 128-bit vector of
+/// [2 x double], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the double-precision
+/// floating point values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each data element represents the mask bits. If a mask bit is zero, the
+/// corresponding value in the memory location is not loaded and the
+/// corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
static __inline __m128d __DEFAULT_FN_ATTRS
_mm_maskload_pd(double const *__p, __m128i __m)
{
return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
}
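
A minimal sketch of the masked-load behaviour documented above: only elements whose mask sign bit is set are read from memory, the rest are zeroed (illustrative only; assumes -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    double src[2] = { 1.5, 2.5 };
    __m128i mask = _mm_set_epi64x(0, -1);   /* element 0 selected, element 1 not */
    double out[2];
    _mm_storeu_pd(out, _mm_maskload_pd(src, mask));
    printf("%g %g\n", out[0], out[1]);      /* prints 1.5 0 */
    return 0;
}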
+/// \brief Conditionally loads double-precision floating point elements from a
+/// memory location pointed to by \a __p into a 256-bit vector of
+/// [4 x double], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the double-precision
+/// floating point values.
+/// \param __m
+/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
+/// significant bit of each quadword element represents the mask bits. If a
+/// mask bit is zero, the corresponding value in the memory location is not
+/// loaded and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [4 x double] containing the loaded values.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_maskload_pd(double const *__p, __m256i __m)
{
@@ -2458,12 +3227,48 @@ _mm256_maskload_pd(double const *__p, __m256i __m)
(__v4di)__m);
}
+/// \brief Conditionally loads single-precision floating point elements from a
+/// memory location pointed to by \a __p into a 128-bit vector of
+/// [4 x float], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the single-precision
+/// floating point values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each data element represents the mask bits. If a mask bit is zero, the
+/// corresponding value in the memory location is not loaded and the
+/// corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm_maskload_ps(float const *__p, __m128i __m)
{
return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
}
+/// \brief Conditionally loads single-precision floating point elements from a
+/// memory location pointed to by \a __p into a 256-bit vector of
+/// [8 x float], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the single-precision
+/// floating point values.
+/// \param __m
+/// A 256-bit integer vector of [8 x dword] containing the mask. The most
+/// significant bit of each dword element represents the mask bits. If a mask
+/// bit is zero, the corresponding value in the memory location is not loaded
+/// and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [8 x float] containing the loaded values.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_maskload_ps(float const *__p, __m256i __m)
{
@@ -2471,24 +3276,96 @@ _mm256_maskload_ps(float const *__p, __m256i __m)
}
/* Conditional store ops */
+/// \brief Moves single-precision floating point values from a 256-bit vector
+/// of [8 x float] to a memory location pointed to by \a __p, according to
+/// the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the float values.
+/// \param __m
+/// A 256-bit integer vector of [8 x dword] containing the mask. The most
+/// significant bit of each dword element in the mask vector represents the
+/// mask bits. If a mask bit is zero, the corresponding value from vector
+/// \a __a is not stored and the corresponding field in the memory location
+/// pointed to by \a __p is not changed.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be stored.
static __inline void __DEFAULT_FN_ATTRS
_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
{
__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
}
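
A minimal sketch of the masked store above: only the elements selected by the dword sign bits are written, and the remaining memory is left untouched (illustrative only; assumes -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    float dst[8] = { 0 };
    __m256 vals = _mm256_set1_ps(9.0f);
    __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);  /* select even elements only */
    _mm256_maskstore_ps(dst, mask, vals);
    for (int i = 0; i < 8; ++i)
        printf("%g ", dst[i]);              /* 9 0 9 0 9 0 9 0 */
    printf("\n");
    return 0;
}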
+/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
+/// to a memory location pointed to by \a __p, according to the specified
+/// mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the double-precision
+///    floating point values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each field in the mask vector represents the mask bits. If a mask bit is
+/// zero, the corresponding value from vector \a __a is not stored and the
+/// corresponding field in the memory location pointed to by \a __p is not
+/// changed.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values to be stored.
static __inline void __DEFAULT_FN_ATTRS
_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
{
__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
}
+/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
+/// to a memory location pointed to by \a __p, according to the specified
+/// mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the double-precision
+///    floating point values.
+/// \param __m
+/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
+/// significant bit of each quadword element in the mask vector represents
+/// the mask bits. If a mask bit is zero, the corresponding value from vector
+///    \a __a is not stored and the corresponding field in the memory location
+/// pointed to by \a __p is not changed.
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the values to be stored.
static __inline void __DEFAULT_FN_ATTRS
_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
{
__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
}
+/// \brief Moves single-precision floating point values from a 128-bit vector
+/// of [4 x float] to a memory location pointed to by \a __p, according to
+/// the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the float values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each field in the mask vector represents the mask bits. If a mask bit is
+///    zero, the corresponding value from vector \a __a is not stored and the
+/// corresponding field in the memory location pointed to by \a __p is not
+/// changed.
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the values to be stored.
static __inline void __DEFAULT_FN_ATTRS
_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
{
@@ -2496,18 +3373,58 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
}
/* Cacheability support ops */
+/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
+/// aligned memory location. To minimize caching, the data is flagged as
+/// non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
+///
+/// \param __a
+/// A pointer to a 32-byte aligned memory location that will receive the
+/// integer values.
+/// \param __b
+/// A 256-bit integer vector containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_si256(__m256i *__a, __m256i __b)
{
__builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
}
+/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
+/// to a 32-byte aligned memory location. To minimize caching, the data is
+/// flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
+///
+/// \param __a
+/// A pointer to a 32-byte aligned memory location that will receive the
+///    double-precision floating point values.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_pd(double *__a, __m256d __b)
{
__builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
}
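
A hedged sketch of how the non-temporal stores above are typically used: the destination must be 32-byte aligned, and an _mm_sfence() is commonly issued so the streamed stores are ordered before later stores. The helper name and the assumption that n is a multiple of 4 are made up for illustration.

#include <immintrin.h>

/* Hypothetical helper: fills dst with `value` using streaming stores.
   Assumes dst is 32-byte aligned and n is a multiple of 4. */
static void fill_nontemporal(double *dst, double value, int n) {
    __m256d v = _mm256_set1_pd(value);
    for (int i = 0; i < n; i += 4)
        _mm256_stream_pd(dst + i, v);   /* VMOVNTPD: bypasses the cache */
    _mm_sfence();                       /* order the streamed stores before later stores */
}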
+/// \brief Moves single-precision floating point values from a 256-bit vector
+/// of [8 x float] to a 32-byte aligned memory location. To minimize
+/// caching, the data is flagged as non-temporal (unlikely to be used again
+/// soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 32-byte aligned memory location that will receive the
+/// single-precision floating point values.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_ps(float *__p, __m256 __a)
{
@@ -2515,30 +3432,105 @@ _mm256_stream_ps(float *__p, __m256 __a)
}
/* Create vectors */
+/// \brief Create a 256-bit vector of [4 x double] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [4 x double] containing undefined values.
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_undefined_pd(void)
{
return (__m256d)__builtin_ia32_undef256();
}
+/// \brief Create a 256-bit vector of [8 x float] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [8 x float] containing undefined values.
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_undefined_ps(void)
{
return (__m256)__builtin_ia32_undef256();
}
+/// \brief Create a 256-bit integer vector with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit integer vector containing undefined values.
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_undefined_si256(void)
{
return (__m256i)__builtin_ia32_undef256();
}
+/// \brief Constructs a 256-bit floating-point vector of [4 x double]
+/// initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A double-precision floating-point value used to initialize bits [255:192]
+/// of the result.
+/// \param __b
+/// A double-precision floating-point value used to initialize bits [191:128]
+/// of the result.
+/// \param __c
+/// A double-precision floating-point value used to initialize bits [127:64]
+/// of the result.
+/// \param __d
+/// A double-precision floating-point value used to initialize bits [63:0]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_set_pd(double __a, double __b, double __c, double __d)
{
return (__m256d){ __d, __c, __b, __a };
}
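
A short sketch of the argument order documented above: the first argument initializes the highest element (bits [255:192]) and the last argument initializes element 0 (illustrative only; assumes -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256d v = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
    double out[4];
    _mm256_storeu_pd(out, v);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   /* prints 1 2 3 4 */
    return 0;
}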
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
+/// with the specified single-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A single-precision floating-point value used to initialize bits [255:224]
+/// of the result.
+/// \param __b
+/// A single-precision floating-point value used to initialize bits [223:192]
+/// of the result.
+/// \param __c
+/// A single-precision floating-point value used to initialize bits [191:160]
+/// of the result.
+/// \param __d
+/// A single-precision floating-point value used to initialize bits [159:128]
+/// of the result.
+/// \param __e
+/// A single-precision floating-point value used to initialize bits [127:96]
+/// of the result.
+/// \param __f
+/// A single-precision floating-point value used to initialize bits [95:64]
+/// of the result.
+/// \param __g
+/// A single-precision floating-point value used to initialize bits [63:32]
+/// of the result.
+/// \param __h
+/// A single-precision floating-point value used to initialize bits [31:0]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_set_ps(float __a, float __b, float __c, float __d,
float __e, float __f, float __g, float __h)
@@ -2546,6 +3538,31 @@ _mm256_set_ps(float __a, float __b, float __c, float __d,
return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
}
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+/// 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i0
+/// A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \param __i1
+/// A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i2
+/// A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i3
+/// A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i4
+/// A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i5
+/// A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i6
+/// A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i7
+/// A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
int __i4, int __i5, int __i6, int __i7)
@@ -2553,6 +3570,47 @@ _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
}
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+/// 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w15
+/// A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \param __w14
+/// A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w13
+/// A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w12
+/// A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w11
+/// A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w10
+/// A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w09
+/// A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w08
+/// A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w07
+/// A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w06
+/// A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w05
+/// A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w04
+/// A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w03
+/// A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w02
+/// A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w01
+/// A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w00
+/// A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
short __w11, short __w10, short __w09, short __w08,
@@ -2563,6 +3621,79 @@ _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
}
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+/// 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b31
+/// An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \param __b30
+/// An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b29
+/// An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b28
+/// An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b27
+/// An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b26
+/// An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b25
+/// An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b24
+/// An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b23
+/// An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b22
+/// An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b21
+/// An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b20
+/// An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b19
+/// An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b18
+/// An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b17
+/// An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b16
+/// An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b15
+/// An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b14
+/// An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b13
+/// An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b12
+/// An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b11
+/// An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b10
+/// An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b09
+/// An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b08
+/// An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b07
+/// An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b06
+/// An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b05
+/// An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b04
+/// An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b03
+/// An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b02
+/// An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b01
+/// An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b00
+/// An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
char __b27, char __b26, char __b25, char __b24,
@@ -2581,6 +3712,23 @@ _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
};
}
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+/// 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \param __b
+/// A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __c
+/// A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __d
+/// A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
{
@@ -2588,12 +3736,68 @@ _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
}
/* Create vectors with elements in reverse order */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double],
+/// initialized in reverse order with the specified double-precision
+/// floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A double-precision floating-point value used to initialize bits [63:0]
+/// of the result.
+/// \param __b
+/// A double-precision floating-point value used to initialize bits [127:64]
+/// of the result.
+/// \param __c
+/// A double-precision floating-point value used to initialize bits [191:128]
+/// of the result.
+/// \param __d
+/// A double-precision floating-point value used to initialize bits [255:192]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setr_pd(double __a, double __b, double __c, double __d)
{
return (__m256d){ __a, __b, __c, __d };
}
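
To illustrate the reversed ordering, a brief sketch (assuming <immintrin.h> and -mavx; names and values are illustrative) comparing _mm256_setr_pd with _mm256_set_pd:

    #include <immintrin.h>

    static void setr_vs_set(double out[4]) {
      __m256d a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); /* first argument -> bits [63:0] */
      __m256d b = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);  /* same value, arguments reversed */
      _mm256_storeu_pd(out, a);                       /* out = {1.0, 2.0, 3.0, 4.0} */
      (void)b;
    }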
+/// \brief Constructs a 256-bit floating-point vector of [8 x float],
+/// initialized in reverse order with the specified single-precision
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A single-precision floating-point value used to initialize bits [31:0]
+/// of the result.
+/// \param __b
+/// A single-precision floating-point value used to initialize bits [63:32]
+/// of the result.
+/// \param __c
+/// A single-precision floating-point value used to initialize bits [95:64]
+/// of the result.
+/// \param __d
+/// A single-precision floating-point value used to initialize bits [127:96]
+/// of the result.
+/// \param __e
+/// A single-precision floating-point value used to initialize bits [159:128]
+/// of the result.
+/// \param __f
+/// A single-precision floating-point value used to initialize bits [191:160]
+/// of the result.
+/// \param __g
+/// A single-precision floating-point value used to initialize bits [223:192]
+/// of the result.
+/// \param __h
+/// A single-precision floating-point value used to initialize bits [255:224]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setr_ps(float __a, float __b, float __c, float __d,
float __e, float __f, float __g, float __h)
@@ -2601,6 +3805,31 @@ _mm256_setr_ps(float __a, float __b, float __c, float __d,
return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
}
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i0
+/// A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+/// A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+/// A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+/// A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i4
+/// A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i5
+/// A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i6
+/// A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i7
+/// A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
int __i4, int __i5, int __i6, int __i7)
@@ -2608,6 +3837,47 @@ _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
}
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w15
+/// A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w14
+/// A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w13
+/// A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w12
+/// A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w11
+/// A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w10
+/// A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w09
+/// A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w08
+/// A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w07
+/// A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w06
+/// A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w05
+/// A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w04
+/// A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w03
+/// A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w02
+/// A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w01
+/// A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w00
+/// A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
short __w11, short __w10, short __w09, short __w08,
@@ -2618,6 +3888,79 @@ _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
__w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
}
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b31
+/// An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b30
+/// An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b29
+/// An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b28
+/// An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b27
+/// An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b26
+/// An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b25
+/// An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b24
+/// An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b23
+/// An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b22
+/// An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b21
+/// An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b20
+/// An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b19
+/// An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b18
+/// An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b17
+/// An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b16
+/// An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b15
+/// An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b14
+/// An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b13
+/// An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b12
+/// An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b11
+/// An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b10
+/// An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b09
+/// An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b08
+/// An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b07
+/// An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b06
+/// An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b05
+/// An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b04
+/// An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b03
+/// An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b02
+/// An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b01
+/// An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b00
+/// An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
char __b27, char __b26, char __b25, char __b24,
@@ -2635,6 +3978,23 @@ _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
}
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \param __b
+/// A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __c
+/// A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __d
+/// A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \returns An initialized 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
{
@@ -2642,24 +4002,74 @@ _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
}
/* Create vectors with repeated elements */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
+/// of the four double-precision floating-point vector elements set to the
+/// specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_set1_pd(double __w)
{
return (__m256d){ __w, __w, __w, __w };
}
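
A small broadcast sketch for the set1 family, assuming <immintrin.h> and -mavx; the helper name is illustrative:

    #include <immintrin.h>

    static __m256d scale_all(__m256d v, double s) {
      return _mm256_mul_pd(v, _mm256_set1_pd(s));  /* s replicated into all four lanes */
    }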
+/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
+/// of the eight single-precision floating-point vector elements set to the
+/// specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __w
+/// A single-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_set1_ps(float __w)
{
return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
}
+/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
+/// 32-bit integral vector elements set to the specified 32-bit integral
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __i
+/// A 32-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [8 x i32].
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi32(int __i)
{
return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
}
+/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
+/// 16-bit integral vector elements set to the specified 16-bit integral
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __w
+/// A 16-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [16 x i16].
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi16(short __w)
{
@@ -2667,6 +4077,17 @@ _mm256_set1_epi16(short __w)
__w, __w, __w, __w, __w, __w };
}
+/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
+/// 8-bit integral vector elements set to the specified 8-bit integral value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __b
+/// An 8-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [32 x i8].
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi8(char __b)
{
@@ -2675,6 +4096,18 @@ _mm256_set1_epi8(char __b)
__b, __b, __b, __b, __b, __b, __b };
}
+/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
+/// 64-bit integral vector elements set to the specified 64-bit integral
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __q
+/// A 64-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [4 x i64].
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_set1_epi64x(long long __q)
{
@@ -2682,18 +4115,41 @@ _mm256_set1_epi64x(long long __q)
}
/* Create __zeroed vectors */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
+/// vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setzero_pd(void)
{
return (__m256d){ 0, 0, 0, 0 };
}
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
+/// vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setzero_ps(void)
{
return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}
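
A typical accumulator-initialization sketch, assuming <immintrin.h> and -mavx; the pointer name and the multiple-of-8 length requirement are illustrative:

    #include <immintrin.h>

    static __m256 sum8(const float *p, int n) {   /* n assumed to be a multiple of 8 */
      __m256 acc = _mm256_setzero_ps();           /* start from all-zero lanes */
      for (int i = 0; i < n; i += 8)
        acc = _mm256_add_ps(acc, _mm256_loadu_ps(p + i));
      return acc;                                 /* eight partial sums */
    }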
+/// \brief Constructs a 256-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit integer vector initialized to zero.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setzero_si256(void)
{
@@ -2701,72 +4157,210 @@ _mm256_setzero_si256(void)
}
/* Cast between vector types */
+/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+/// floating-point vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+/// bitwise pattern as the parameter.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castpd_ps(__m256d __a)
{
return (__m256)__a;
}
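
These casts (and the related ones below) reinterpret bits without converting values; a brief sketch assuming <immintrin.h> and -mavx, with an illustrative helper name:

    #include <immintrin.h>

    static __m256d clear_sign_bits(__m256d v) {
      /* Build a mask in the integer domain, then reinterpret it as doubles. */
      __m256i m = _mm256_set1_epi64x(0x7fffffffffffffffLL);
      return _mm256_and_pd(v, _mm256_castsi256_pd(m));   /* absolute value */
    }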
+/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+/// parameter.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castpd_si256(__m256d __a)
{
return (__m256i)__a;
}
+/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+/// floating-point vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+/// bitwise pattern as the parameter.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castps_pd(__m256 __a)
{
return (__m256d)__a;
}
+/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+/// parameter.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castps_si256(__m256 __a)
{
return (__m256i)__a;
}
+/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
+/// of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+/// bitwise pattern as the parameter.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castsi256_ps(__m256i __a)
{
return (__m256)__a;
}
+/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
+/// of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+/// bitwise pattern as the parameter.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castsi256_pd(__m256i __a)
{
return (__m256d)__a;
}
+/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
+/// [4 x double] as a 128-bit floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the
+/// lower 128 bits of the parameter.
static __inline __m128d __DEFAULT_FN_ATTRS
_mm256_castpd256_pd128(__m256d __a)
{
return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
}
+/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
+/// [8 x float] as a 128-bit floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the
+/// lower 128 bits of the parameter.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm256_castps256_ps128(__m256 __a)
{
return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
}
+/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \returns A 128-bit integer vector containing the lower 128 bits of the
+/// parameter.
static __inline __m128i __DEFAULT_FN_ATTRS
_mm256_castsi256_si128(__m256i __a)
{
return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
}
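
A sketch of taking the low half via the narrowing casts, assuming <immintrin.h> and -mavx:

    #include <immintrin.h>

    static __m128i low_half(__m256i v) {
      return _mm256_castsi256_si128(v);   /* bits [127:0]; no instruction needed */
    }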
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
+/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
+/// contain the value of the source vector. The contents of the upper 128
+/// bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
+/// contain the value of the parameter. The contents of the upper 128 bits
+/// are undefined.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_castpd128_pd256(__m128d __a)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
}
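
Because the upper 128 bits are unspecified after these widening casts, they are normally filled in immediately; a sketch assuming <immintrin.h> and -mavx:

    #include <immintrin.h>

    static __m256d widen(__m128d lo, __m128d hi) {
      __m256d v = _mm256_castpd128_pd256(lo);   /* upper 128 bits unspecified */
      return _mm256_insertf128_pd(v, hi, 1);    /* define bits [255:128] */
    }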
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
+/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+/// the value of the source vector. The contents of the upper 128 bits are
+/// undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
+/// contain the value of the parameter. The contents of the upper 128 bits
+/// are undefined.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_castps128_ps256(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
}
+/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
+/// The lower 128 bits contain the value of the source vector. The contents
+/// of the upper 128 bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
+/// the parameter. The contents of the upper 128 bits are undefined.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_castsi128_si256(__m128i __a)
{
@@ -2778,6 +4372,38 @@ _mm256_castsi128_si256(__m128i __a)
We use macros rather than inlines because we only want to accept
invocations where the immediate M is a constant expression.
*/
+/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
+/// a 256-bit vector of [8 x float] given in the first parameter, and then
+/// replacing either the upper or the lower 128 bits with the contents of a
+/// 128-bit vector of [4 x float] in the second parameter. The immediate
+///    integer parameter selects either the upper or the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit vector of [8 x float]. This vector is copied to the result
+/// first, and then either the upper or the lower 128 bits of the result will
+/// be replaced by the contents of \a V2.
+/// \param V2
+/// A 128-bit vector of [4 x float]. The contents of this parameter are
+/// written to either the upper or the lower 128 bits of the result depending
+/// on the value of parameter \a M.
+/// \param M
+/// An immediate integer. The least significant bit determines how the values
+/// from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
+/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
+/// result. \n
+///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
+/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+/// result.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
(__m256)__builtin_shufflevector( \
(__v8sf)(__m256)(V1), \
@@ -2791,6 +4417,38 @@ _mm256_castsi128_si256(__m128i __a)
(((M) & 1) ? 10 : 6), \
(((M) & 1) ? 11 : 7) );})
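
Because M must be a constant expression, the macro is invoked with a literal immediate; a usage sketch, assuming <immintrin.h> and -mavx:

    #include <immintrin.h>

    static __m256 replace_upper(__m256 v, __m128 x) {
      return _mm256_insertf128_ps(v, x, 1);   /* bit 0 of M set: x -> bits [255:128] */
    }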
+/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
+/// a 256-bit vector of [4 x double] given in the first parameter, and then
+/// replacing either the upper or the lower 128 bits with the contents of a
+/// 128-bit vector of [2 x double] in the second parameter. The immediate
+///    integer parameter selects either the upper or the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit vector of [4 x double]. This vector is copied to the result
+/// first, and then either the upper or the lower 128 bits of the result will
+/// be replaced by the contents of \a V2.
+/// \param V2
+/// A 128-bit vector of [2 x double]. The contents of this parameter are
+/// written to either the upper or the lower 128 bits of the result depending
+/// on the value of parameter \a M.
+/// \param M
+/// An immediate integer. The least significant bit determines how the values
+/// from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
+/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
+/// result. \n
+///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
+/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+/// result.
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
(__m256d)__builtin_shufflevector( \
(__v4df)(__m256d)(V1), \
@@ -2800,6 +4458,38 @@ _mm256_castsi128_si256(__m128i __a)
(((M) & 1) ? 4 : 2), \
(((M) & 1) ? 5 : 3) );})
+/// \brief Constructs a new 256-bit integer vector by first duplicating a
+/// 256-bit integer vector given in the first parameter, and then replacing
+/// either the upper or the lower 128 bits with the contents of a 128-bit
+/// integer vector in the second parameter. The immediate integer parameter
+///    selects either the upper or the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit integer vector. This vector is copied to the result first, and
+/// then either the upper or the lower 128 bits of the result will be
+/// replaced by the contents of \a V2.
+/// \param V2
+/// A 128-bit integer vector. The contents of this parameter are written to
+/// either the upper or the lower 128 bits of the result depending on the
+/// value of parameter \a M.
+/// \param M
+/// An immediate integer. The least significant bit determines how the values
+/// from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
+/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
+/// result. \n
+///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
+/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+/// result.
+/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
(__m256i)__builtin_shufflevector( \
(__v4di)(__m256i)(V1), \
@@ -2814,6 +4504,27 @@ _mm256_castsi128_si256(__m128i __a)
We use macros rather than inlines because we only want to accept
invocations where the immediate M is a constant expression.
*/
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
+/// of [8 x float], as determined by the immediate integer parameter, and
+/// returns the extracted bits as a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [8 x float].
+/// \param M
+/// An immediate integer. The least significant bit determines which bits are
+/// extracted from the first parameter: \n
+/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+/// result. \n
+/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
(__m128)__builtin_shufflevector( \
(__v8sf)(__m256)(V), \
@@ -2823,6 +4534,27 @@ _mm256_castsi128_si256(__m128i __a)
(((M) & 1) ? 6 : 2), \
(((M) & 1) ? 7 : 3) );})
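
A companion sketch for extraction (the immediate must again be a constant expression), assuming <immintrin.h> and -mavx:

    #include <immintrin.h>

    static __m128 upper_ps(__m256 v) {
      return _mm256_extractf128_ps(v, 1);   /* bits [255:128] of v */
    }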
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
+/// of [4 x double], as determined by the immediate integer parameter, and
+/// returns the extracted bits as a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [4 x double].
+/// \param M
+/// An immediate integer. The least significant bit determines which bits are
+/// extracted from the first parameter: \n
+/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+/// result. \n
+/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
(__m128d)__builtin_shufflevector( \
(__v4df)(__m256d)(V), \
@@ -2830,6 +4562,27 @@ _mm256_castsi128_si256(__m128i __a)
(((M) & 1) ? 2 : 0), \
(((M) & 1) ? 3 : 1) );})
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit
+/// integer vector, as determined by the immediate integer parameter, and
+/// returns the extracted bits as a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+/// A 256-bit integer vector.
+/// \param M
+/// An immediate integer. The least significant bit determines which bits are
+/// extracted from the first parameter: \n
+/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+/// result. \n
+/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
(__m128i)__builtin_shufflevector( \
(__v4di)(__m256i)(V), \
@@ -2838,6 +4591,27 @@ _mm256_castsi128_si256(__m128i __a)
(((M) & 1) ? 3 : 1) );})
/* SIMD load ops (unaligned) */
+/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
+/// unaligned memory locations and constructs a 256-bit floating-point vector
+/// of [8 x float] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing 4 consecutive
+/// single-precision floating-point values. These values are to be copied to
+/// bits[255:128] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing 4 consecutive
+/// single-precision floating-point values. These values are to be copied to
+/// bits[127:0] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+/// concatenated result.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
{
@@ -2845,6 +4619,27 @@ _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
}
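
A sketch of the split-load helpers, assuming <immintrin.h> and -mavx; the buffer names are illustrative and need no particular alignment:

    #include <immintrin.h>

    static __m256 load_halves(const float *lo4, const float *hi4) {
      /* hi4[0..3] -> bits [255:128], lo4[0..3] -> bits [127:0] */
      return _mm256_loadu2_m128(hi4, lo4);
    }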
+/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
+/// unaligned memory locations and constructs a 256-bit floating-point vector
+/// of [4 x double] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing two consecutive
+/// double-precision floating-point values. These values are to be copied to
+/// bits[255:128] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing two consecutive
+/// double-precision floating-point values. These values are to be copied to
+/// bits[127:0] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+/// concatenated result.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
{
@@ -2852,6 +4647,24 @@ _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
}
+/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
+/// constructs a 256-bit integer vector by concatenating the two 128-bit
+/// vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing a 128-bit integer
+/// vector. This vector is to be copied to bits[255:128] of the result. The
+/// address of the memory location does not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing a 128-bit integer
+/// vector. This vector is to be copied to bits[127:0] of the result. The
+/// address of the memory location does not have to be aligned.
+/// \returns A 256-bit integer vector containing the concatenated result.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
{
@@ -2860,6 +4673,24 @@ _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
}
/* SIMD store ops (unaligned) */
+/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
+/// vector of [8 x float] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
{
@@ -2871,6 +4702,24 @@ _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
_mm_storeu_ps(__addr_hi, __v128);
}
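
And the matching split store, under the same assumptions (<immintrin.h>, -mavx, unaligned buffers of at least four floats each):

    #include <immintrin.h>

    static void store_halves(float *lo4, float *hi4, __m256 v) {
      _mm256_storeu2_m128(hi4, lo4, v);   /* bits [255:128] -> hi4, bits [127:0] -> lo4 */
    }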
+/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
+/// vector of [4 x double] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
{
@@ -2882,6 +4731,24 @@ _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
_mm_storeu_pd(__addr_hi, __v128);
}
+/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
+/// two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit integer vector.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
{
@@ -2893,33 +4760,132 @@ _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
_mm_storeu_si128(__addr_hi, __v128);
}
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
+/// concatenating two 128-bit floating-point vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
+/// 128 bits of the result.
+/// \param __lo
+/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+/// concatenated result.
static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_set_m128 (__m128 __hi, __m128 __lo) {
+_mm256_set_m128 (__m128 __hi, __m128 __lo)
+{
return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
}
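
A concatenation sketch, assuming <immintrin.h> and -mavx; note that _mm256_set_m128 takes the high half first:

    #include <immintrin.h>

    static __m256 concat(__m128 lo, __m128 hi) {
      return _mm256_set_m128(hi, lo);   /* hi -> bits [255:128], lo -> bits [127:0] */
    }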
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
+/// concatenating two 128-bit floating-point vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
+/// 128 bits of the result.
+/// \param __lo
+/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+/// concatenated result.
static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_set_m128d (__m128d __hi, __m128d __lo) {
+_mm256_set_m128d (__m128d __hi, __m128d __lo)
+{
return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
}
+/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
+/// integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+/// A 128-bit integer vector to be copied to the upper 128 bits of the
+/// result.
+/// \param __lo
+/// A 128-bit integer vector to be copied to the lower 128 bits of the
+/// result.
+/// \returns A 256-bit integer vector containing the concatenated result.
static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_m128i (__m128i __hi, __m128i __lo) {
+_mm256_set_m128i (__m128i __hi, __m128i __lo)
+{
return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
}
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
+/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
+/// similar to _mm256_set_m128, but the order of the input parameters is
+/// swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
+/// 128 bits of the result.
+/// \param __hi
+/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+/// concatenated result.
static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_setr_m128 (__m128 __lo, __m128 __hi) {
+_mm256_setr_m128 (__m128 __lo, __m128 __hi)
+{
return _mm256_set_m128(__hi, __lo);
}
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
+/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
+/// similar to _mm256_set_m128d, but the order of the input parameters is
+/// swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
+/// 128 bits of the result.
+/// \param __hi
+/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+/// concatenated result.
static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_setr_m128d (__m128d __lo, __m128d __hi) {
+_mm256_setr_m128d (__m128d __lo, __m128d __hi)
+{
return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
}
+/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
+/// integer vectors. This is similar to _mm256_set_m128i, but the order of
+/// the input parameters is swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+/// A 128-bit integer vector to be copied to the lower 128 bits of the
+/// result.
+/// \param __hi
+/// A 128-bit integer vector to be copied to the upper 128 bits of the
+/// result.
+/// \returns A 256-bit integer vector containing the concatenated result.
static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_m128i (__m128i __lo, __m128i __hi) {
+_mm256_setr_m128i (__m128i __lo, __m128i __hi)
+{
return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
}
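
As with the element-wise setr forms, the _mm256_setr_m128* variants take the halves in low-to-high order; a sketch assuming <immintrin.h> and -mavx:

    #include <immintrin.h>

    static __m256i concat_si(__m128i lo, __m128i hi) {
      return _mm256_setr_m128i(lo, hi);   /* equivalent to _mm256_set_m128i(hi, lo) */
    }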
diff --git a/lib/Headers/bmiintrin.h b/lib/Headers/bmiintrin.h
index 30acfaeb9f3b..488eb2dbd3d4 100644
--- a/lib/Headers/bmiintrin.h
+++ b/lib/Headers/bmiintrin.h
@@ -36,7 +36,7 @@
/// unsigned short _tzcnt_u16(unsigned short a);
/// \endcode
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
@@ -53,7 +53,7 @@
/// unsigned int _andn_u32(unsigned int a, unsigned int b);
/// \endcode
///
-/// This intrinsic corresponds to the \c ANDN instruction.
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param a
/// An unsigned integer containing one of the operands.
@@ -73,7 +73,7 @@
/// unsigned int _blsi_u32(unsigned int a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSI instruction.
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param a
/// An unsigned integer whose bits are to be cleared.
@@ -91,7 +91,7 @@
/// unsigned int _blsmsk_u32(unsigned int a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param a
/// An unsigned integer used to create the mask.
@@ -107,7 +107,7 @@
/// unsigned int _blsr_u32(unsigned int a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSR instruction.
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param a
/// An unsigned integer containing the operand to be cleared.
@@ -123,7 +123,7 @@
/// unsigned int _tzcnt_u32(unsigned int a);
/// \endcode
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
@@ -143,7 +143,7 @@
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
@@ -160,7 +160,7 @@ __tzcnt_u16(unsigned short __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c ANDN instruction.
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param __X
/// An unsigned integer containing one of the operands.
@@ -180,7 +180,7 @@ __andn_u32(unsigned int __X, unsigned int __Y)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
@@ -202,7 +202,7 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
@@ -225,7 +225,7 @@ _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSI instruction.
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be cleared.
@@ -243,7 +243,7 @@ __blsi_u32(unsigned int __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param __X
/// An unsigned integer used to create the mask.
@@ -259,7 +259,7 @@ __blsmsk_u32(unsigned int __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSR instruction.
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param __X
/// An unsigned integer containing the operand to be cleared.
@@ -275,7 +275,7 @@ __blsr_u32(unsigned int __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
@@ -291,12 +291,12 @@ __tzcnt_u32(unsigned int __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An 32-bit integer containing the number of trailing zero
-/// bits in the operand.
+/// \returns A 32-bit integer containing the number of trailing zero bits in
+/// the operand.
static __inline__ int __RELAXED_FN_ATTRS
_mm_tzcnt_32(unsigned int __X)
{
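A small TZCNT usage sketch, assuming <x86intrin.h> and a BMI-enabled build (-mbmi); the values and helper name are illustrative:

    #include <x86intrin.h>

    static void tzcnt_examples(void) {
      unsigned int a = __tzcnt_u32(0x10u);  /* 4: four trailing zero bits */
      unsigned int b = __tzcnt_u32(0u);     /* 32: TZCNT returns the operand width for 0 */
      (void)a; (void)b;
    }
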
@@ -314,7 +314,7 @@ _mm_tzcnt_32(unsigned int __X)
/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b);
/// \endcode
///
-/// This intrinsic corresponds to the \c ANDN instruction.
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer containing one of the operands.
@@ -334,7 +334,7 @@ _mm_tzcnt_32(unsigned int __X)
/// unsigned long long _blsi_u64(unsigned long long a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSI instruction.
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer whose bits are to be cleared.
@@ -352,7 +352,7 @@ _mm_tzcnt_32(unsigned int __X)
/// unsigned long long _blsmsk_u64(unsigned long long a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer used to create the mask.
@@ -368,7 +368,7 @@ _mm_tzcnt_32(unsigned int __X)
/// unsigned long long _blsr_u64(unsigned long long a);
/// \endcode
///
-/// This intrinsic corresponds to the \c BLSR instruction.
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer containing the operand to be cleared.
@@ -384,7 +384,7 @@ _mm_tzcnt_32(unsigned int __X)
/// unsigned long long _tzcnt_u64(unsigned long long a);
/// \endcode
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
@@ -397,7 +397,7 @@ _mm_tzcnt_32(unsigned int __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c ANDN instruction.
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing one of the operands.
@@ -417,7 +417,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
@@ -439,7 +439,7 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BEXTR instruction.
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
@@ -462,7 +462,7 @@ _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSI instruction.
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be cleared.
@@ -480,7 +480,7 @@ __blsi_u64(unsigned long long __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSMSK instruction.
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer used to create the mask.
@@ -496,7 +496,7 @@ __blsmsk_u64(unsigned long long __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c BLSR instruction.
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing the operand to be cleared.
@@ -512,7 +512,7 @@ __blsr_u64(unsigned long long __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
@@ -528,12 +528,12 @@ __tzcnt_u64(unsigned long long __X)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c TZCNT instruction.
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns An 64-bit integer containing the number of trailing zero
-/// bits in the operand.
+/// \returns A 64-bit integer containing the number of trailing zero bits in
+///    the operand.
static __inline__ long long __RELAXED_FN_ATTRS
_mm_tzcnt_64(unsigned long long __X)
{
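
For reference, here is a minimal usage sketch of the BMI/TZCNT intrinsics documented above (not part of this patch; it assumes an x86-64 target compiled with -mbmi, and the variable names are illustrative):

// Hedged example, not from the patch: isolate and locate the lowest set bit.
#include <stdio.h>
#include <x86intrin.h>

int main(void) {
  unsigned int x = 0x00F0u;
  unsigned int lowest = __blsi_u32(x);       /* keeps only the lowest set bit: 0x10 */
  long long zeros = _mm_tzcnt_64(0x00F0ull); /* trailing zero count: 4 */
  printf("lowest=0x%x zeros=%lld\n", lowest, zeros);
  return 0;
}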
diff --git a/lib/Headers/cuda_wrappers/algorithm b/lib/Headers/cuda_wrappers/algorithm
new file mode 100644
index 000000000000..95d9beb73c68
--- /dev/null
+++ b/lib/Headers/cuda_wrappers/algorithm
@@ -0,0 +1,96 @@
+/*===---- algorithm - CUDA wrapper for <algorithm> --------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
+#define __CLANG_CUDA_WRAPPERS_ALGORITHM
+
+// This header defines __device__ overloads of std::min/max, but only if we're
+// <= C++11. In C++14, these functions are constexpr, and so are implicitly
+// __host__ __device__.
+//
+// We don't support the initializer_list overloads because
+// initializer_list::begin() and end() are not __host__ __device__ functions.
+//
+// When compiling in C++14 mode, we could force std::min/max to have different
+// implementations for host and device, by declaring the device overloads
+// before the constexpr overloads appear. We choose not to do this because:
+//
+// a) why write our own implementation when we can use one from the standard
+// library? and
+// b) libstdc++ is evil and declares min/max inside a header that is included
+// *before* we include <algorithm>. So we'd have to unconditionally
+// declare our __device__ overloads of min/max, but that would pollute
+// things for people who choose not to include <algorithm>.
+
+#include_next <algorithm>
+
+#if __cplusplus <= 201103L
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>). Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b, __Cmp __cmp) {
+ return __cmp(__a, __b) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b) {
+ return __a < __b ? __b : __a;
+}
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b, __Cmp __cmp) {
+ return __cmp(__b, __a) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b) {
+ return __a < __b ? __b : __a;
+}
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#endif // __cplusplus <= 201103L
+#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM
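
A hedged sketch of the device code this wrapper is meant to support when clang compiles CUDA at -std=c++11 or lower (the kernel name and parameters are illustrative, not from the patch):

// Hedged example: std::min/max resolve to the __device__ overloads above.
#include <algorithm>

__global__ void clamp_kernel(int *data, int lo, int hi, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] = std::min(std::max(data[i], lo), hi);
}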
diff --git a/lib/Headers/cuda_wrappers/complex b/lib/Headers/cuda_wrappers/complex
new file mode 100644
index 000000000000..11d40a82a8f6
--- /dev/null
+++ b/lib/Headers/cuda_wrappers/complex
@@ -0,0 +1,82 @@
+/*===---- complex - CUDA wrapper for <complex> ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
+#define __CLANG_CUDA_WRAPPERS_COMPLEX
+
+// Wrapper around <complex> that forces its functions to be __host__
+// __device__.
+
+// First, include host-only headers we think are likely to be included by
+// <complex>, so that the pragma below only applies to <complex> itself.
+#if __cplusplus >= 201103L
+#include <type_traits>
+#endif
+#include <stdexcept>
+#include <cmath>
+#include <sstream>
+
+// Next, include our <algorithm> wrapper, to ensure that device overloads of
+// std::min/max are available.
+#include <algorithm>
+
+#pragma clang force_cuda_host_device begin
+
+// When compiling for device, ask libstdc++ to use its own implementations of
+// complex functions, rather than calling builtins (which resolve to library
+// functions that don't exist when compiling CUDA device code).
+//
+// This is a little dicey, because it causes libstdc++ to define a different
+// set of overloads on host and device.
+//
+// // Present only when compiling for host.
+// __host__ __device__ complex<float> sin(const complex<float>& x) {
+// return __builtin_csinf(x);
+// }
+//
+// // Present when compiling for host and for device.
+// template <typename T>
+// __host__ __device__ complex<T> sin(const complex<T>& x) {
+//   return complex<T>(sin(x.real()) * cosh(x.imag()),
+//                     cos(x.real()) * sinh(x.imag()));
+// }
+//
+// This is safe because when compiling for device, all function calls in
+// __host__ code to sin() will still resolve to *something*, even if they don't
+// resolve to the same function as they resolve to when compiling for host. We
+// don't care that they don't resolve to the right function because we won't
+// codegen this host code when compiling for device.
+
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#define _GLIBCXX_USE_C99_COMPLEX 0
+#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
+
+#include_next <complex>
+
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
+
+#pragma clang force_cuda_host_device end
+
+#endif // include guard
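
For illustration only (not part of the patch), the force_cuda_host_device region above is what lets device code such as the following hypothetical function use <complex> directly:

// Hedged example: std::complex usable in __device__ code via this wrapper.
#include <complex>

__device__ float magnitude_squared(std::complex<float> z) {
  return std::norm(z);  // norm() is __host__ __device__ under the pragma above
}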
diff --git a/lib/Headers/cuda_wrappers/new b/lib/Headers/cuda_wrappers/new
new file mode 100644
index 000000000000..b77131af0e5b
--- /dev/null
+++ b/lib/Headers/cuda_wrappers/new
@@ -0,0 +1,47 @@
+/*===---- new - CUDA wrapper for <new> --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_NEW
+#define __CLANG_CUDA_WRAPPERS_NEW
+
+#include_next <new>
+
+// Device overrides for placement new and delete.
+#pragma push_macro("CUDA_NOEXCEPT")
+#if __cplusplus >= 201103L
+#define CUDA_NOEXCEPT noexcept
+#else
+#define CUDA_NOEXCEPT
+#endif
+
+__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+ return __ptr;
+}
+__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+ return __ptr;
+}
+__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
+__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
+#pragma pop_macro("CUDA_NOEXCEPT")
+
+#endif // include guard
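
A hedged sketch of device-side placement new enabled by these overrides (the struct and buffer are illustrative; it assumes -std=c++11 for alignas and braced initialization):

// Hedged example: construct an object in place from __device__ code.
#include <new>

struct Pair { int a, b; };

__device__ void build_in_place(void) {
  alignas(Pair) unsigned char storage[sizeof(Pair)];
  Pair *p = new (storage) Pair{1, 2};  // uses the __device__ placement new above
  p->~Pair();                          // explicit destruction; nothing to deallocate
}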
diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h
index 70d6d726110a..1512f9f0b47b 100644
--- a/lib/Headers/emmintrin.h
+++ b/lib/Headers/emmintrin.h
@@ -49,6 +49,21 @@ typedef signed char __v16qs __attribute__((__vector_size__(16)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
+/// \brief Adds lower double-precision values in both operands and returns the
+/// sum in the lower 64 bits of the result. The upper 64 bits of the result
+/// are copied from the upper double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
+/// from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_sd(__m128d __a, __m128d __b)
{
@@ -56,12 +71,41 @@ _mm_add_sd(__m128d __a, __m128d __b)
return __a;
}
+/// \brief Adds two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the sums of both
+/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_pd(__m128d __a, __m128d __b)
{
return (__m128d)((__v2df)__a + (__v2df)__b);
}
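
A hedged illustration (not from the patch) of the scalar-versus-packed distinction the two additions above document; the helper name is made up:

// Hedged example: _mm_add_sd touches only the low lane, _mm_add_pd both lanes.
#include <emmintrin.h>

static __m128d add_demo(void) {
  __m128d a = _mm_set_pd(3.0, 1.0);  /* low = 1.0, high = 3.0 */
  __m128d b = _mm_set_pd(4.0, 2.0);  /* low = 2.0, high = 4.0 */
  __m128d s = _mm_add_sd(a, b);      /* low = 3.0, high = 3.0 (copied from a) */
  __m128d p = _mm_add_pd(a, b);      /* low = 3.0, high = 7.0 */
  return _mm_add_pd(s, p);
}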
+/// \brief Subtracts the lower double-precision value of the second operand
+/// from the lower double-precision value of the first operand and returns
+/// the difference in the lower 64 bits of the result. The upper 64 bits of
+/// the result are copied from the upper double-precision value of the first
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// difference of the lower 64 bits of both operands. The upper 64 bits are
+/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_sd(__m128d __a, __m128d __b)
{
@@ -69,12 +113,40 @@ _mm_sub_sd(__m128d __a, __m128d __b)
return __a;
}
+/// \brief Subtracts two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the differences between
+/// both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_pd(__m128d __a, __m128d __b)
{
return (__m128d)((__v2df)__a - (__v2df)__b);
}
+/// \brief Multiplies lower double-precision values in both operands and returns
+/// the product in the lower 64 bits of the result. The upper 64 bits of the
+/// result are copied from the upper double-precision value of the first
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// product of the lower 64 bits of both operands. The upper 64 bits are
+/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_sd(__m128d __a, __m128d __b)
{
@@ -82,12 +154,41 @@ _mm_mul_sd(__m128d __a, __m128d __b)
return __a;
}
+/// \brief Multiplies two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the products of both
+/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_pd(__m128d __a, __m128d __b)
{
return (__m128d)((__v2df)__a * (__v2df)__b);
}
+/// \brief Divides the lower double-precision value of the first operand by the
+/// lower double-precision value of the second operand and returns the
+/// quotient in the lower 64 bits of the result. The upper 64 bits of the
+/// result are copied from the upper double-precision value of the first
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// quotient of the lower 64 bits of both operands. The upper 64 bits are
+/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_sd(__m128d __a, __m128d __b)
{
@@ -95,12 +196,44 @@ _mm_div_sd(__m128d __a, __m128d __b)
return __a;
}
+/// \brief Performs an element-by-element division of two 128-bit vectors of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] containing the quotients of both
+/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_pd(__m128d __a, __m128d __b)
{
return (__m128d)((__v2df)__a / (__v2df)__b);
}
+/// \brief Calculates the square root of the lower double-precision value of
+/// the second operand and returns it in the lower 64 bits of the result.
+/// The upper 64 bits of the result are copied from the upper double-
+/// precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// upper 64 bits of this operand are copied to the upper 64 bits of the
+/// result.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// square root is calculated using the lower 64 bits of this operand.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// square root of the lower 64 bits of operand \a __b, and whose upper 64
+/// bits are copied from the upper 64 bits of operand \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_sd(__m128d __a, __m128d __b)
{
@@ -108,150 +241,518 @@ _mm_sqrt_sd(__m128d __a, __m128d __b)
return (__m128d) { __c[0], __a[1] };
}
+/// \brief Calculates the square root of each of the two values stored in a
+/// 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [2 x double] containing the square roots of the
+/// values in the operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_pd(__m128d __a)
{
return __builtin_ia32_sqrtpd((__v2df)__a);
}
+/// \brief Compares lower 64-bit double-precision values of both operands, and
+///    returns the lesser of the pair of values in the lower 64 bits of the
+/// result. The upper 64 bits of the result are copied from the upper double-
+/// precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// minimum value between both operands. The upper 64 bits are copied from
+/// the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Performs element-by-element comparison of the two 128-bit vectors of
+/// [2 x double] and returns the vector containing the lesser of each pair of
+/// values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the minimum values
+/// between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares lower 64-bit double-precision values of both operands, and
+///    returns the greater of the pair of values in the lower 64 bits of the
+/// result. The upper 64 bits of the result are copied from the upper double-
+/// precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// maximum value between both operands. The upper 64 bits are copied from
+/// the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Performs element-by-element comparison of the two 128-bit vectors of
+/// [2 x double] and returns the vector containing the greater of each pair
+/// of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the maximum values
+/// between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
}
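
A hedged usage note (not from the patch): the packed min/max pair documented above composes into a lane-wise clamp; the helper name is illustrative:

// Hedged example: clamp each lane of x into [lo, hi].
#include <emmintrin.h>

static __m128d clamp_pd(__m128d x, __m128d lo, __m128d hi) {
  return _mm_min_pd(_mm_max_pd(x, lo), hi);
}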
+/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_and_pd(__m128d __a, __m128d __b)
{
- return (__m128d)((__v4su)__a & (__v4su)__b);
+ return (__m128d)((__v2du)__a & (__v2du)__b);
}
+/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using
+/// the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the left source operand. The
+/// one's complement of this value is used in the bitwise AND.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+/// values in the second operand and the one's complement of the first
+/// operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_andnot_pd(__m128d __a, __m128d __b)
{
- return (__m128d)(~(__v4su)__a & (__v4su)__b);
+ return (__m128d)(~(__v2du)__a & (__v2du)__b);
}
+/// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
+/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_or_pd(__m128d __a, __m128d __b)
{
- return (__m128d)((__v4su)__a | (__v4su)__b);
+ return (__m128d)((__v2du)__a | (__v2du)__b);
}
+/// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
+/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_xor_pd(__m128d __a, __m128d __b)
{
- return (__m128d)((__v4su)__a ^ (__v4su)__b);
+ return (__m128d)((__v2du)__a ^ (__v2du)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0h
+/// for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are less than those in the second operand. Each comparison
+/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are less than or equal to those in the second operand. Each
+/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are greater than those in the second operand. Each comparison
+/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are greater than or equal to those in the second operand. Each
+/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are ordered with respect to those in the second operand. A pair
+/// of double-precision values are "ordered" with respect to each other if
+/// neither value is a NaN. Each comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are unordered with respect to those in the second operand. A pair
+/// of double-precision values are "unordered" with respect to each other if
+/// one or both values are NaN. Each comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are unequal to those in the second operand. Each comparison
+/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not less than those in the second operand. Each comparison
+/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not less than or equal to those in the second operand. Each
+/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not greater than those in the second operand. Each
+/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
}
+/// \brief Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not greater than or equal to those in the second operand.
+/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_pd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
}
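
A hedged sketch (not from the patch) of how the all-ones/all-zeros masks returned by these packed comparisons are typically consumed, here as a branch-free per-lane select; the helper name is illustrative:

// Hedged example: pick, per lane, whichever of a or b is smaller.
#include <emmintrin.h>

static __m128d select_min(__m128d a, __m128d b) {
  __m128d mask = _mm_cmplt_pd(a, b);        /* all ones where a < b, zeros elsewhere */
  return _mm_or_pd(_mm_and_pd(mask, a),     /* keep a where the mask is set */
                   _mm_andnot_pd(mask, b)); /* keep b where it is not */
}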
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] for equality. The
+/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than the corresponding value in
+/// the second parameter. The comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0h for
+/// false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than the corresponding value
+/// in the second parameter. The comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_sd(__m128d __a, __m128d __b)
{
@@ -259,6 +760,24 @@ _mm_cmpgt_sd(__m128d __a, __m128d __b)
return (__m128d) { __c[0], __a[1] };
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0h for
+/// false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_sd(__m128d __a, __m128d __b)
{
@@ -266,36 +785,147 @@ _mm_cmpge_sd(__m128d __a, __m128d __b)
return (__m128d) { __c[0], __a[1] };
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is "ordered" with respect to the
+/// corresponding value in the second parameter. The comparison yields 0h for
+/// false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are
+/// "ordered" with respect to each other if neither value is a NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is "unordered" with respect to the
+/// corresponding value in the second parameter. The comparison yields 0h
+/// for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values
+/// are "unordered" with respect to each other if one or both values are NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is unequal to the corresponding value in
+/// the second parameter. The comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not less than the corresponding
+/// value in the second parameter. The comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not less than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0h
+/// for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_sd(__m128d __a, __m128d __b)
{
return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not greater than the corresponding
+/// value in the second parameter. The comparison yields 0h for false,
+/// FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_sd(__m128d __a, __m128d __b)
{
@@ -303,6 +933,24 @@ _mm_cmpngt_sd(__m128d __a, __m128d __b)
return (__m128d) { __c[0], __a[1] };
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not greater than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0h
+/// for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contain the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_sd(__m128d __a, __m128d __b)
{
@@ -310,84 +958,317 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b)
return (__m128d) { __c[0], __a[1] };
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] for equality. The
+/// comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than the corresponding value in
+/// the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0 for
+/// false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than the corresponding value
+/// in the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0 for
+/// false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is unequal to the corresponding value in
+/// the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
}
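
As a hedged usage note (not from the patch), the COMISD-based predicates above return ordinary integers and are intended for direct branching; the helper name is illustrative:

// Hedged example: branch on the low-lane ordered comparison.
#include <emmintrin.h>

static int lower_lane_less(__m128d a, __m128d b) {
  return _mm_comilt_sd(a, b);  /* 1 when the low lane of a is less than that of b */
}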
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] for equality. The
+/// comparison yields 0 for false, 1 for true. If either of the two lower
+/// double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than the corresponding value in
+/// the second parameter. The comparison yields 0 for false, 1 for true. If
+/// either of the two lower double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0 for
+/// false, 1 for true. If either of the two lower double-precision values is
+/// NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than the corresponding value
+/// in the second parameter. The comparison yields 0 for false, 1 for true.
+/// If either of the two lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than or equal to the
+/// corresponding value in the second parameter. The comparison yields 0 for
+/// false, 1 for true. If either of the two lower double-precision values
+/// is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
}
+/// \brief Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is unequal to the corresponding value in
+/// the second parameter. The comparison yields 0 for false, 1 for true. If
+/// either of the two lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison result. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_sd(__m128d __a, __m128d __b)
{
return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
}
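A short sketch of the unordered comparisons and the NaN behavior documented above (illustrative only; NAN comes from <math.h>):

#include <emmintrin.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    __m128d x = _mm_set_sd(1.0);
    __m128d n = _mm_set_sd(NAN);            /* NAN macro from <math.h> */
    printf("%d\n", _mm_ucomilt_sd(x, n));   /* 1: NaN operand, per the rule above */
    printf("%d\n", _mm_ucomigt_sd(x, n));   /* 0: NaN operand, per the rule above */
    return 0;
}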
+/// \brief Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two single-precision floating-point
+/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
+/// The upper 64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+/// converted values. The upper 64 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtpd_ps(__m128d __a)
{
return __builtin_ia32_cvtpd2ps((__v2df)__a);
}
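An illustrative use of _mm_cvtpd_ps (not part of the header):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d d = _mm_set_pd(2.5, 1.5);   /* elements: { 1.5, 2.5 } */
    __m128  f = _mm_cvtpd_ps(d);        /* { 1.5f, 2.5f, 0.0f, 0.0f } */
    float out[4];
    _mm_storeu_ps(out, f);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}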
+/// \brief Converts the lower two single-precision floating-point elements of a
+/// 128-bit vector of [4 x float] into two double-precision floating-point
+/// values, returned in a 128-bit vector of [2 x double]. The upper two
+/// elements of the input vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower two single-precision
+/// floating-point elements are converted to double-precision values. The
+/// upper two elements are unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtps_pd(__m128 __a)
{
@@ -395,6 +1276,19 @@ _mm_cvtps_pd(__m128 __a)
__builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
}
+/// \brief Converts the lower two integer elements of a 128-bit vector of
+/// [4 x i32] into two double-precision floating-point values, returned in a
+/// 128-bit vector of [2 x double]. The upper two elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
+/// converted to double-precision values. The upper two elements are unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepi32_pd(__m128i __a)
{
@@ -402,24 +1296,84 @@ _mm_cvtepi32_pd(__m128i __a)
__builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
}
+/// \brief Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
+/// 64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+/// converted values. The upper 64 bits are set to zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epi32(__m128d __a)
{
return __builtin_ia32_cvtpd2dq((__v2df)__a);
}
+/// \brief Converts the low-order element of a 128-bit vector of [2 x double]
+/// into a 32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+/// conversion.
+/// \returns A 32-bit signed integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtsd_si32(__m128d __a)
{
return __builtin_ia32_cvtsd2si((__v2df)__a);
}
+/// \brief Converts the lower double-precision floating-point element of a
+/// 128-bit vector of [2 x double], in the second parameter, into a
+/// single-precision floating-point value, returned in the lower 32 bits of a
+/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
+/// copied from the upper 96 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
+/// copied to the upper 96 bits of the result.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision
+/// floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
+/// converted value from the second parameter. The upper 96 bits are copied
+/// from the upper 96 bits of the first parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsd_ss(__m128 __a, __m128d __b)
{
return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
}
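A brief sketch showing how _mm_cvtsd_ss merges the converted value with the upper bits of the first argument (illustrative only):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128  a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* elements: { 1, 2, 3, 4 } */
    __m128d b = _mm_set_sd(9.5);
    __m128  r = _mm_cvtsd_ss(a, b);                  /* { 9.5f, 2.0f, 3.0f, 4.0f } */
    float out[4];
    _mm_storeu_ps(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}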
+/// \brief Converts a 32-bit signed integer value, in the second parameter, into
+/// a double-precision floating-point value, returned in the lower 64 bits of
+/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+/// are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+/// copied to the upper 64 bits of the result.
+/// \param __b
+/// A 32-bit signed integer containing the value to be converted.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+/// converted value from the second parameter. The upper 64 bits are copied
+/// from the upper 64 bits of the first parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi32_sd(__m128d __a, int __b)
{
@@ -427,6 +1381,25 @@ _mm_cvtsi32_sd(__m128d __a, int __b)
return __a;
}
+/// \brief Converts the lower single-precision floating-point element of a
+/// 128-bit vector of [4 x float], in the second parameter, into a
+/// double-precision floating-point value, returned in the lower 64 bits of
+/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+/// are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+/// copied to the upper 64 bits of the result.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower single-precision
+/// floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+/// converted value from the second parameter. The upper 64 bits are copied
+/// from the upper 64 bits of the first parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtss_sd(__m128d __a, __m128 __b)
{
@@ -434,48 +1407,145 @@ _mm_cvtss_sd(__m128d __a, __m128 __b)
return __a;
}
+/// \brief Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the
+/// result of either conversion is inexact, the result is truncated (rounded
+/// towards zero) regardless of the current MXCSR setting. The upper 64 bits
+/// of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+/// converted values. The upper 64 bits are set to zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttpd_epi32(__m128d __a)
{
return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
}
+/// \brief Converts the low-order element of a [2 x double] vector into a 32-bit
+/// signed integer value, truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+/// conversion.
+/// \returns A 32-bit signed integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvttsd_si32(__m128d __a)
{
return __builtin_ia32_cvttsd2si((__v2df)__a);
}
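A small example contrasting the rounding conversion with the truncating one above (illustrative only; assumes the default round-to-nearest MXCSR state):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d v = _mm_set_sd(2.7);
    /* _mm_cvtsd_si32 rounds per MXCSR (round-to-nearest by default);
       _mm_cvttsd_si32 always truncates toward zero. */
    printf("%d %d\n", _mm_cvtsd_si32(v), _mm_cvttsd_si32(v));   /* 3 2 */
    return 0;
}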
+/// \brief Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_cvtpd_pi32(__m128d __a)
{
return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
}
+/// \brief Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in a 64-bit vector of [2 x i32]. If the result of either
+/// conversion is inexact, the result is truncated (rounded towards zero)
+/// regardless of the current MXCSR setting.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_cvttpd_pi32(__m128d __a)
{
return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
}
+/// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of
+/// [2 x i32] into two double-precision floating-point values, returned in a
+/// 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
+///
+/// \param __a
+/// A 64-bit vector of [2 x i32].
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtpi32_pd(__m64 __a)
{
return __builtin_ia32_cvtpi2pd((__v2si)__a);
}
+/// \brief Returns the low-order element of a 128-bit vector of [2 x double] as
+/// a double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
+/// \returns A double-precision floating-point value copied from the lower 64
+/// bits of \a __a.
static __inline__ double __DEFAULT_FN_ATTRS
_mm_cvtsd_f64(__m128d __a)
{
return __a[0];
}
+/// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location has to be 16-byte aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_pd(double const *__dp)
{
return *(__m128d*)__dp;
}
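An illustrative aligned load; the aligned(16) attribute is a GNU extension used here only to satisfy the alignment requirement stated above:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    /* 16-byte alignment is required by _mm_load_pd; _mm_loadu_pd has no such
       requirement. */
    double buf[2] __attribute__((aligned(16))) = { 1.0, 2.0 };
    __m128d v = _mm_load_pd(buf);        /* { 1.0, 2.0 } */
    double out[2];
    _mm_storeu_pd(out, v);
    printf("%g %g\n", out[0], out[1]);
    return 0;
}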
+/// \brief Loads a double-precision floating-point value from a specified memory
+/// location and duplicates it to both vector elements of a 128-bit vector of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
+///
+/// \param __dp
+/// A pointer to a memory location containing a double-precision value.
+/// \returns A 128-bit vector of [2 x double] containing the loaded and
+/// duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load1_pd(double const *__dp)
{
@@ -488,6 +1558,20 @@ _mm_load1_pd(double const *__dp)
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
+/// \brief Loads two double-precision values, in reverse order, from an aligned
+/// memory location into a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction
+/// plus needed shuffling instructions. In AVX mode, the shuffling may be
+/// combined with the <c> VMOVAPD </c>, resulting in only a <c> VPERMILPD </c>
+/// instruction.
+///
+/// \param __dp
+/// A 16-byte aligned pointer to an array of double-precision values to be
+/// loaded in reverse order.
+/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
+/// values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadr_pd(double const *__dp)
{
@@ -495,6 +1579,17 @@ _mm_loadr_pd(double const *__dp)
return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
}
+/// \brief Loads a 128-bit floating-point vector of [2 x double] from an
+/// unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadu_pd(double const *__dp)
{
@@ -524,6 +1619,23 @@ _mm_load_sd(double const *__dp)
return (__m128d){ __u, 0 };
}
+/// \brief Loads a double-precision value into the high-order bits of a 128-bit
+/// vector of [2 x double]. The low-order bits are copied from the low-order
+/// bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [63:0] are written to bits [63:0] of the result.
+/// \param __dp
+/// A pointer to a 64-bit memory location containing a double-precision
+/// floating-point value that is loaded. The loaded value is written to bits
+/// [127:64] of the result. The address of the memory location does not have
+/// to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadh_pd(__m128d __a, double const *__dp)
{
@@ -534,6 +1646,23 @@ _mm_loadh_pd(__m128d __a, double const *__dp)
return (__m128d){ __a[0], __u };
}
+/// \brief Loads a double-precision value into the low-order bits of a 128-bit
+/// vector of [2 x double]. The high-order bits are copied from the
+/// high-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [127:64] are written to bits [127:64] of the result.
+/// \param __dp
+/// A pointer to a 64-bit memory location containing a double-precision
+/// floating-point value that is loaded. The loaded value is written to bits
+/// [63:0] of the result. The address of the memory location does not have to
+/// be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadl_pd(__m128d __a, double const *__dp)
{
@@ -544,48 +1673,149 @@ _mm_loadl_pd(__m128d __a, double const *__dp)
return (__m128d){ __u, __a[1] };
}
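A sketch combining _mm_loadl_pd and _mm_loadh_pd to assemble a vector from two scalars (illustrative only):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    double lo = 1.0, hi = 2.0;
    __m128d v = _mm_setzero_pd();
    v = _mm_loadl_pd(v, &lo);    /* bits [63:0]   <- 1.0 */
    v = _mm_loadh_pd(v, &hi);    /* bits [127:64] <- 2.0 */
    double out[2];
    _mm_storeu_pd(out, v);
    printf("%g %g\n", out[0], out[1]);   /* 1 2 */
    return 0;
}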
+/// \brief Constructs a 128-bit floating-point vector of [2 x double] with
+/// unspecified content. This could be used as an argument to another
+/// intrinsic function where the argument is required but the value is not
+/// actually used.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
+/// content.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_undefined_pd(void)
{
return (__m128d)__builtin_ia32_undef128();
}
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
+/// 64 bits of the vector are initialized with the specified double-precision
+/// floating-point value. The upper 64 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize the lower 64
+/// bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
+/// lower 64 bits contain the value of the parameter. The upper 64 bits are
+/// set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_sd(double __w)
{
return (__m128d){ __w, 0 };
}
+/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
+/// of the two double-precision floating-point vector elements set to the
+/// specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set1_pd(double __w)
{
return (__m128d){ __w, __w };
}
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]
+/// initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize the upper 64
+/// bits of the result.
+/// \param __x
+/// A double-precision floating-point value used to initialize the lower 64
+/// bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd(double __w, double __x)
{
return (__m128d){ __x, __w };
}
+/// \brief Constructs a 128-bit floating-point vector of [2 x double],
+/// initialized in reverse order with the specified double-precision
+/// floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize the lower 64
+/// bits of the result.
+/// \param __x
+/// A double-precision floating-point value used to initialize the upper 64
+/// bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setr_pd(double __w, double __x)
{
return (__m128d){ __w, __x };
}
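A short example contrasting the argument order of _mm_set_pd and _mm_setr_pd (illustrative only):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    double out[2];
    _mm_storeu_pd(out, _mm_set_pd(2.0, 1.0));    /* memory order: 1.0, 2.0 */
    printf("set_pd:  %g %g\n", out[0], out[1]);
    _mm_storeu_pd(out, _mm_setr_pd(1.0, 2.0));   /* same result, arguments in memory order */
    printf("setr_pd: %g %g\n", out[0], out[1]);
    return 0;
}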
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]
+/// initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit floating-point vector of [2 x double] with
+/// all elements set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setzero_pd(void)
{
return (__m128d){ 0, 0 };
}
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
+/// 64 bits are set to the lower 64 bits of the second parameter. The upper
+/// 64 bits are set to the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
+/// upper 64 bits of the result.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
+/// lower 64 bits of the result.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_move_sd(__m128d __a, __m128d __b)
{
return (__m128d){ __b[0], __a[1] };
}
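An illustrative use of _mm_move_sd (not part of the header):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d a = _mm_set_pd(11.0, 10.0);   /* { 10.0, 11.0 } */
    __m128d b = _mm_set_pd(21.0, 20.0);   /* { 20.0, 21.0 } */
    __m128d r = _mm_move_sd(a, b);        /* { 20.0, 11.0 }: low from b, high from a */
    double out[2];
    _mm_storeu_pd(out, r);
    printf("%g %g\n", out[0], out[1]);
    return 0;
}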
+/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 64-bit memory location.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_sd(double *__dp, __m128d __a)
{
@@ -608,12 +1838,36 @@ _mm_store1_pd(double *__dp, __m128d __a)
_mm_store_pd(__dp, __a);
}
+/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory
+/// location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location has to be 16-byte aligned.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd1(double *__dp, __m128d __a)
{
return _mm_store1_pd(__dp, __a);
}
+/// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory
+/// location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_pd(double *__dp, __m128d __a)
{
@@ -623,6 +1877,20 @@ _mm_storeu_pd(double *__dp, __m128d __a)
((struct __storeu_pd*)__dp)->__v = __a;
}
+/// \brief Stores two double-precision values, in reverse order, from a 128-bit
+/// vector of [2 x double] to a 16-byte aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to a shuffling instruction followed by a
+/// <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 16-byte aligned memory location that can store two
+/// double-precision values.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values to be reversed and
+/// stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_pd(double *__dp, __m128d __a)
{
@@ -630,6 +1898,17 @@ _mm_storer_pd(double *__dp, __m128d __a)
*(__m128d *)__dp = __a;
}
+/// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 64-bit memory location.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pd(double *__dp, __m128d __a)
{
@@ -639,6 +1918,17 @@ _mm_storeh_pd(double *__dp, __m128d __a)
((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
}
+/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 64-bit memory location.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pd(double *__dp, __m128d __a)
{
@@ -648,127 +1938,391 @@ _mm_storel_pd(double *__dp, __m128d __a)
((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
}
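A sketch pairing _mm_storel_pd and _mm_storeh_pd to extract both elements (illustrative only):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d v = _mm_set_pd(2.0, 1.0);   /* { 1.0, 2.0 } */
    double lo, hi;
    _mm_storel_pd(&lo, v);   /* lo = 1.0 (bits [63:0])   */
    _mm_storeh_pd(&hi, v);   /* hi = 2.0 (bits [127:64]) */
    printf("%g %g\n", lo, hi);
    return 0;
}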
+/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
+/// saving the lower 8 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [16 x i8]. The integer elements of both
+/// parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [16 x i8].
+/// \param __b
+/// A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the sums of both
+/// parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi8(__m128i __a, __m128i __b)
{
return (__m128i)((__v16qu)__a + (__v16qu)__b);
}
+/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
+/// saving the lower 16 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [8 x i16]. The integer elements of both
+/// parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16].
+/// \param __b
+/// A 128-bit vector of [8 x i16].
+/// \returns A 128-bit vector of [8 x i16] containing the sums of both
+/// parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi16(__m128i __a, __m128i __b)
{
return (__m128i)((__v8hu)__a + (__v8hu)__b);
}
+/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
+/// saving the lower 32 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [4 x i32]. The integer elements of both
+/// parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32].
+/// \param __b
+/// A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the sums of both
+/// parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi32(__m128i __a, __m128i __b)
{
return (__m128i)((__v4su)__a + (__v4su)__b);
}
+/// \brief Adds two signed or unsigned 64-bit integer values, returning the
+/// lower 64 bits of the sum.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer.
+/// \param __b
+/// A 64-bit integer.
+/// \returns A 64-bit integer containing the sum of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_add_si64(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
}
+/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
+/// saving the lower 64 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [2 x i64]. The integer elements of both
+/// parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x i64].
+/// \param __b
+/// A 128-bit vector of [2 x i64].
+/// \returns A 128-bit vector of [2 x i64] containing the sums of both
+/// parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi64(__m128i __a, __m128i __b)
{
return (__m128i)((__v2du)__a + (__v2du)__b);
}
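A small example of the wraparound behavior of the non-saturating adds above (illustrative only):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_set1_epi8((char)250);
    __m128i b = _mm_set1_epi8(10);
    __m128i r = _mm_add_epi8(a, b);           /* 250 + 10 = 260; low 8 bits -> 4 */
    unsigned char out[16];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%u\n", out[0]);                   /* 4 */
    return 0;
}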
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+/// signed [16 x i8] vectors, saving each sum in the corresponding element of
+/// a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are
+/// saturated to 7Fh. Negative sums less than 80h are saturated to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [16 x i8] vector.
+/// \param __b
+/// A 128-bit signed [16 x i8] vector.
+/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
+/// both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
}
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+/// signed [8 x i16] vectors, saving each sum in the corresponding element of
+/// a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh
+/// are saturated to 7FFFh. Negative sums less than 8000h are saturated to
+/// 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
+/// both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
+/// of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh
+/// are saturated to FFh. Negative sums are saturated to 00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
+/// of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
}
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
+/// of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh
+/// are saturated to FFFFh. Negative sums are saturated to 0000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+/// A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
+/// of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
}
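A sketch contrasting signed and unsigned saturating addition (illustrative only):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_set1_epi16(30000);
    __m128i b = _mm_set1_epi16(10000);
    short s[8];
    unsigned short u[8];
    _mm_storeu_si128((__m128i *)s, _mm_adds_epi16(a, b));   /* signed: saturates to 32767 */
    _mm_storeu_si128((__m128i *)u, _mm_adds_epu16(a, b));   /* unsigned: 40000 fits, no saturation */
    printf("%d %u\n", s[0], u[0]);
    return 0;
}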
+/// \brief Computes the rounded averages of corresponding elements of two
+/// 128-bit unsigned [16 x i8] vectors, saving each result in the
+/// corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
+/// averages of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
}
+/// \brief Computes the rounded averages of corresponding elements of two
+/// 128-bit unsigned [8 x i16] vectors, saving each result in the
+/// corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+/// A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
+/// averages of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
+/// vectors, producing eight intermediate 32-bit signed integer products, and
+/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
+/// [4 x i32] vector. For example, bits [15:0] of both parameters are
+/// multiplied producing a 32-bit product, bits [31:16] of both parameters
+/// are multiplied producing a 32-bit product, and the sum of those two
+/// products becomes bits [31:0] of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
+/// of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_madd_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
}
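An illustrative multiply-add step, e.g. as used in dot products (not part of the header):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    __m128i b = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
    __m128i r = _mm_madd_epi16(a, b);   /* { 1*10+2*20, 3*30+4*40, 5*50+6*60, 7*70+8*80 } */
    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);   /* 50 250 610 1130 */
    return 0;
}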
+/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
+/// vectors, saving the greater value from each comparison in the
+/// corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
+/// each comparison.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
+/// vectors, saving the greater value from each comparison in the
+/// corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
+/// each comparison.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
}
+/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
+/// vectors, saving the smaller value from each comparison in the
+/// corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
+/// each comparison.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
+/// vectors, saving the smaller value from each comparison in the
+/// corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
+/// each comparison.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
}
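A short example of the unsigned byte min/max above (illustrative only):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_set1_epi8((char)200);   /* 200 when treated as unsigned */
    __m128i b = _mm_set1_epi8(100);
    unsigned char out[16];
    _mm_storeu_si128((__m128i *)out, _mm_max_epu8(a, b));   /* 200 */
    printf("max: %u\n", out[0]);
    _mm_storeu_si128((__m128i *)out, _mm_min_epu8(a, b));   /* 100 */
    printf("min: %u\n", out[0]);
    return 0;
}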
+/// \brief Multiplies the corresponding elements of two signed [8 x i16]
+/// vectors, saving the upper 16 bits of each 32-bit product in the
+/// corresponding element of a 128-bit signed [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
+/// each of the eight 32-bit products.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Multiplies the corresponding elements of two unsigned [8 x i16]
+/// vectors, saving the upper 16 bits of each 32-bit product in the
+/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+/// A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
+/// of each of the eight 32-bit products.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epu16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
}
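A sketch reassembling the full 32-bit product from _mm_mulhi_epi16 and _mm_mullo_epi16 (documented just below; illustrative only):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_set1_epi16(1000);
    __m128i b = _mm_set1_epi16(700);
    short h[8], l[8];
    _mm_storeu_si128((__m128i *)h, _mm_mulhi_epi16(a, b));   /* upper 16 bits of each product */
    _mm_storeu_si128((__m128i *)l, _mm_mullo_epi16(a, b));   /* lower 16 bits of each product */
    int full = ((int)h[0] << 16) | (unsigned short)l[0];
    printf("%d\n", full);   /* 700000 */
    return 0;
}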
-/// \brief Multiplies the corresponding elements of two [8 x short] vectors and
-/// returns a vector containing the low-order 16 bits of each 32-bit product
-/// in the corresponding element.
+/// \brief Multiplies the corresponding elements of two signed [8 x i16]
+/// vectors, saving the lower 16 bits of each 32-bit product in the
+/// corresponding element of a 128-bit signed [8 x i16] result vector.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
+/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
///
/// \param __a
-/// A 128-bit integer vector containing one of the source operands.
+/// A 128-bit signed [8 x i16] vector.
/// \param __b
-/// A 128-bit integer vector containing one of the source operands.
-/// \returns A 128-bit integer vector containing the products of both operands.
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
+/// each of the eight 32-bit products.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi16(__m128i __a, __m128i __b)
{
@@ -781,7 +2335,7 @@ _mm_mullo_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMULUDQ instruction.
+/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
///
/// \param __a
/// A 64-bit integer containing one of the source operands.
@@ -800,7 +2354,7 @@ _mm_mul_su32(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
+/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
///
/// \param __a
/// A [2 x i64] vector containing one of the source operands.
@@ -821,7 +2375,7 @@ _mm_mul_epu32(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
+/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing one of the source operands.
@@ -839,7 +2393,7 @@ _mm_sad_epu8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
+/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -857,7 +2411,7 @@ _mm_sub_epi8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
+/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -875,7 +2429,7 @@ _mm_sub_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
+/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -894,7 +2448,7 @@ _mm_sub_epi32(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBQ instruction.
+/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing the minuend.
@@ -912,7 +2466,7 @@ _mm_sub_si64(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
+/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -933,7 +2487,7 @@ _mm_sub_epi64(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
+/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -954,7 +2508,7 @@ _mm_subs_epi8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
+/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -974,7 +2528,7 @@ _mm_subs_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
+/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -994,7 +2548,7 @@ _mm_subs_epu8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
+/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the minuends.
@@ -1012,7 +2566,7 @@ _mm_subs_epu16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPAND / PAND instruction.
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing one of the source operands.
@@ -1031,7 +2585,7 @@ _mm_and_si128(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
///
/// \param __a
/// A 128-bit vector containing the left source operand. The one's complement
@@ -1049,7 +2603,7 @@ _mm_andnot_si128(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPOR / POR instruction.
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing one of the source operands.
@@ -1067,7 +2621,7 @@ _mm_or_si128(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing one of the source operands.
@@ -1090,13 +2644,13 @@ _mm_xor_si128(__m128i __a, __m128i __b)
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
+/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
/// A 128-bit integer vector containing the source operand.
/// \param imm
-/// An immediate value specifying the number of bytes to left-shift
-/// operand a.
+/// An immediate value specifying the number of bytes to left-shift operand
+/// \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector( \
@@ -1127,13 +2681,13 @@ _mm_xor_si128(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to left-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi16(__m128i __a, int __count)
@@ -1146,13 +2700,13 @@ _mm_slli_epi16(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to left-shift each value in operand __a.
+/// to left-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi16(__m128i __a, __m128i __count)
@@ -1165,13 +2719,13 @@ _mm_sll_epi16(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to left-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi32(__m128i __a, int __count)
@@ -1184,13 +2738,13 @@ _mm_slli_epi32(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to left-shift each value in operand __a.
+/// to left-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi32(__m128i __a, __m128i __count)
@@ -1203,13 +2757,13 @@ _mm_sll_epi32(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to left-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi64(__m128i __a, int __count)
@@ -1222,13 +2776,13 @@ _mm_slli_epi64(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to left-shift each value in operand __a.
+/// to left-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the left-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi64(__m128i __a, __m128i __count)
@@ -1242,13 +2796,13 @@ _mm_sll_epi64(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi16(__m128i __a, int __count)
@@ -1262,13 +2816,13 @@ _mm_srai_epi16(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand __a.
+/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi16(__m128i __a, __m128i __count)
@@ -1282,13 +2836,13 @@ _mm_sra_epi16(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi32(__m128i __a, int __count)
@@ -1302,13 +2856,13 @@ _mm_srai_epi32(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand __a.
+/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi32(__m128i __a, __m128i __count)
@@ -1325,13 +2879,13 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
-/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
+/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
/// A 128-bit integer vector containing the source operand.
/// \param imm
/// An immediate value specifying the number of bytes to right-shift operand
-/// a.
+/// \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector( \
@@ -1362,13 +2916,13 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi16(__m128i __a, int __count)
@@ -1381,13 +2935,13 @@ _mm_srli_epi16(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand __a.
+/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi16(__m128i __a, __m128i __count)
@@ -1400,13 +2954,13 @@ _mm_srl_epi16(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi32(__m128i __a, int __count)
@@ -1419,13 +2973,13 @@ _mm_srli_epi32(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand __a.
+/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi32(__m128i __a, __m128i __count)
@@ -1438,13 +2992,13 @@ _mm_srl_epi32(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// An integer value specifying the number of bits to right-shift each value
-/// in operand __a.
+/// in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi64(__m128i __a, int __count)
@@ -1457,13 +3011,13 @@ _mm_srli_epi64(__m128i __a, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
///
/// \param __a
/// A 128-bit integer vector containing the source operand.
/// \param __count
/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand __a.
+/// to right-shift each value in operand \a __a.
/// \returns A 128-bit integer vector containing the right-shifted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi64(__m128i __a, __m128i __count)
@@ -1477,7 +3031,7 @@ _mm_srl_epi64(__m128i __a, __m128i __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
+/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1496,7 +3050,7 @@ _mm_cmpeq_epi8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
+/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1515,7 +3069,7 @@ _mm_cmpeq_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
+/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1535,7 +3089,7 @@ _mm_cmpeq_epi32(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1557,7 +3111,7 @@ _mm_cmpgt_epi8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1577,7 +3131,7 @@ _mm_cmpgt_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1597,7 +3151,7 @@ _mm_cmpgt_epi32(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1617,7 +3171,7 @@ _mm_cmplt_epi8(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1637,7 +3191,7 @@ _mm_cmplt_epi16(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1658,7 +3212,7 @@ _mm_cmplt_epi32(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
@@ -1680,7 +3234,7 @@ _mm_cvtsi64_sd(__m128d __a, long long __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
@@ -1697,7 +3251,8 @@ _mm_cvtsd_si64(__m128d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+/// instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
@@ -1714,7 +3269,7 @@ _mm_cvttsd_si64(__m128d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
///
/// \param __a
/// A 128-bit integer vector.
@@ -1729,7 +3284,7 @@ _mm_cvtepi32_ps(__m128i __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1746,7 +3301,8 @@ _mm_cvtps_epi32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
+/// instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1762,7 +3318,7 @@ _mm_cvttps_epi32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __a
/// A 32-bit signed integer operand.
@@ -1779,7 +3335,7 @@ _mm_cvtsi32_si128(int __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __a
/// A 64-bit signed integer operand containing the value to be converted.
@@ -1796,7 +3352,7 @@ _mm_cvtsi64_si128(long long __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __a
/// A vector of [4 x i32]. The least significant 32 bits are moved to the
@@ -1815,7 +3371,7 @@ _mm_cvtsi128_si32(__m128i __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __a
/// A vector of [2 x i64]. The least significant 64 bits are moved to the
@@ -1833,7 +3389,7 @@ _mm_cvtsi128_si64(__m128i __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
+/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
///
/// \param __p
/// An aligned pointer to a memory location containing integer values.
@@ -1849,7 +3405,7 @@ _mm_load_si128(__m128i const *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
+/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
///
/// \param __p
/// A pointer to a memory location containing integer values.
@@ -1868,7 +3424,7 @@ _mm_loadu_si128(__m128i const *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __p
/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
@@ -2154,42 +3710,170 @@ _mm_set1_epi8(char __b)
return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
}
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
+/// instruction.
+///
+/// \param __q0
+/// A 64-bit integral value used to initialize the lower 64 bits of the
+/// result.
+/// \param __q1
+/// A 64-bit integral value used to initialize the upper 64 bits of the
+/// result.
+/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi64(__m64 __q0, __m64 __q1)
{
return (__m128i){ (long long)__q0, (long long)__q1 };
}
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i0
+/// A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+/// A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+/// A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+/// A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
{
return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
}
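A minimal usage sketch for the reverse-order constructors above (assumptions: SSE2 is enabled, e.g. -msse2, and the helper name is illustrative):

  #include <emmintrin.h>

  /* _mm_setr_epi32 takes elements in ascending bit order, so out[] ends up
     as {1, 2, 3, 4}; _mm_set_epi32(1, 2, 3, 4) would store {4, 3, 2, 1}. */
  static void setr_order_sketch(int out[4])
  {
    __m128i v = _mm_setr_epi32(1, 2, 3, 4);  /* 1 -> bits [31:0], 4 -> bits [127:96] */
    _mm_storeu_si128((__m128i *)out, v);
  }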
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w0
+/// A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w1
+/// A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w2
+/// A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w3
+/// A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w4
+/// A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w5
+/// A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w6
+/// A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w7
+/// A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
{
return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
}
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b0
+/// An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b1
+/// An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b2
+/// An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b3
+/// An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b4
+/// An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b5
+/// An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b6
+/// An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b7
+/// An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b8
+/// An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b9
+/// An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b10
+/// An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b11
+/// An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b12
+/// An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b13
+/// An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b14
+/// An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b15
+/// An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
{
return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
}
+/// \brief Creates a 128-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit integer vector with all elements set to
+/// zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setzero_si128(void)
{
return (__m128i){ 0LL, 0LL };
}
+/// \brief Stores a 128-bit integer vector to a memory location aligned on a
+/// 128-bit boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+/// A pointer to an aligned memory location that will receive the integer
+/// values.
+/// \param __b
+/// A 128-bit integer vector containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_si128(__m128i *__p, __m128i __b)
{
*__p = __b;
}
+/// \brief Stores a 128-bit integer vector to an unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the integer values.
+/// \param __b
+/// A 128-bit integer vector containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i *__p, __m128i __b)
{
@@ -2199,12 +3883,45 @@ _mm_storeu_si128(__m128i *__p, __m128i __b)
((struct __storeu_si128*)__p)->__v = __b;
}
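A short sketch of the aligned/unaligned store pair above (assumptions: SSE2 is enabled; the buffer size and helper name are illustrative):

  #include <emmintrin.h>

  /* _mm_store_si128 requires a 16-byte-aligned destination; _mm_storeu_si128
     accepts any address, such as buf + 1 here. */
  static void unaligned_store_sketch(unsigned char *buf /* >= 17 bytes */)
  {
    __m128i v = _mm_set1_epi8(0x5A);
    _mm_storeu_si128((__m128i *)(buf + 1), v);
  }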
+/// \brief Moves bytes selected by the mask from the first operand to the
+/// specified unaligned memory location. When a mask bit is 1, the
+///    corresponding byte is written; otherwise it is not written. To minimize
+///    caching, the data is flagged as non-temporal (unlikely to be used again
+/// soon). Exception and trap behavior for elements not selected for storage
+/// to memory are implementation dependent.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
+/// instruction.
+///
+/// \param __d
+/// A 128-bit integer vector containing the values to be moved.
+/// \param __n
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each byte represents the mask bits.
+/// \param __p
+/// A pointer to an unaligned 128-bit memory location where the specified
+/// values are moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
{
__builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
}
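A hedged sketch of the masked store above (assumptions: SSE2 is enabled; because exception behavior for unselected bytes is implementation dependent, dst is assumed to point at 16 accessible bytes; the helper name is illustrative):

  #include <emmintrin.h>

  /* Writes only dst[0..3]; a byte is stored only where the corresponding
     mask byte has its most significant bit set. */
  static void maskmove_sketch(char *dst /* 16 accessible bytes */)
  {
    __m128i src  = _mm_set1_epi8(7);
    __m128i mask = _mm_setr_epi8((char)0x80, (char)0x80, (char)0x80, (char)0x80,
                                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    _mm_maskmoveu_si128(src, mask, dst);
  }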
+/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
+/// a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 64-bit memory location that will receive the lower 64 bits
+/// of the integer vector parameter.
+/// \param __a
+/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
+/// value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i *__p, __m128i __a)
{
@@ -2214,18 +3931,54 @@ _mm_storel_epi64(__m128i *__p, __m128i __a)
((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
}
+/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
+/// aligned memory location. To minimize caching, the data is flagged as
+/// non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPD / MOVNTPD </c> instruction.
+///
+/// \param __p
+/// A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+/// A vector of [2 x double] containing the 64-bit values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double *__p, __m128d __a)
{
__builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
}
+/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQ / MOVNTDQ </c> instruction.
+///
+/// \param __p
+/// A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+/// A 128-bit integer vector containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i *__p, __m128i __a)
{
__builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
}
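A sketch of a typical non-temporal fill using the streaming store above (assumptions: SSE2 is enabled; _mm_sfence comes from xmmintrin.h, which emmintrin.h includes; the helper name is illustrative):

  #include <emmintrin.h>
  #include <stddef.h>

  /* Fill n 16-byte blocks with weakly-ordered streaming stores, then fence
     so the data is globally visible before the function returns. */
  static void stream_fill_sketch(__m128i *dst /* 16-byte aligned */, size_t n)
  {
    __m128i zero = _mm_setzero_si128();
    for (size_t i = 0; i < n; ++i)
      _mm_stream_si128(dst + i, zero);
    _mm_sfence();
  }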
+/// \brief Stores a 32-bit integer value in the specified memory location. To
+/// minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
+///
+/// \param __p
+/// A pointer to the 32-bit memory location used to store the value.
+/// \param __a
+/// A 32-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si32(int *__p, int __a)
{
@@ -2233,6 +3986,18 @@ _mm_stream_si32(int *__p, int __a)
}
#ifdef __x86_64__
+/// \brief Stores a 64-bit integer value in the specified memory location. To
+/// minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
+///
+/// \param __p
+/// A pointer to the 64-bit memory location used to store the value.
+/// \param __a
+/// A 64-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si64(long long *__p, long long __a)
{
@@ -2240,42 +4005,154 @@ _mm_stream_si64(long long *__p, long long __a)
}
#endif
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_clflush(void const *__p)
-{
- __builtin_ia32_clflush(__p);
-}
+#if defined(__cplusplus)
+extern "C" {
+#endif
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_lfence(void)
-{
- __builtin_ia32_lfence();
-}
+/// \brief The cache line containing \a __p is flushed and invalidated from all
+/// caches in the coherency domain.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
+///
+/// \param __p
+/// A pointer to the memory location used to identify the cache line to be
+/// flushed.
+void _mm_clflush(void const *);
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_mfence(void)
-{
- __builtin_ia32_mfence();
-}
+/// \brief Forces strong memory ordering (serialization) between load
+/// instructions preceding this instruction and load instructions following
+/// this instruction, ensuring the system completes all previous loads before
+/// executing subsequent loads.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
+///
+void _mm_lfence(void);
+/// \brief Forces strong memory ordering (serialization) between load and store
+/// instructions preceding this instruction and load and store instructions
+/// following this instruction, ensuring that the system completes all
+/// previous memory accesses before executing subsequent memory accesses.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
+///
+void _mm_mfence(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+/// operands into 8-bit signed integers, and packs the results into the
+/// destination. Positive values greater than 0x7F are saturated to 0x7F.
+/// Negative values less than 0x80 are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit signed integer with
+/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+/// written to the lower 64 bits of the result.
+/// \param __b
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit signed integer with
+/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+/// written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
}
+/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+/// operands into 16-bit signed integers, and packs the results into the
+/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+/// Negative values less than 0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+/// a signed integer and is converted to a 16-bit signed integer with
+/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+/// are written to the lower 64 bits of the result.
+/// \param __b
+/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+/// a signed integer and is converted to a 16-bit signed integer with
+/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+/// are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
}
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+/// operands into 8-bit unsigned integers, and packs the results into the
+/// destination. Values greater than 0xFF are saturated to 0xFF. Values less
+/// than 0x00 are saturated to 0x00.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+/// a signed integer and is converted to an 8-bit unsigned integer with
+/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+/// written to the lower 64 bits of the result.
+/// \param __b
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+/// a signed integer and is converted to an 8-bit unsigned integer with
+/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+/// written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
}
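A sketch contrasting the signed and unsigned saturating packs above (assumptions: SSE2 is enabled; the helper name is illustrative):

  #include <emmintrin.h>

  /* The same out-of-range 16-bit inputs clamp to [-128, 127] with
     _mm_packs_epi16 and to [0, 255] with _mm_packus_epi16. */
  static void pack_saturation_sketch(__m128i *signed_out, __m128i *unsigned_out)
  {
    __m128i big   = _mm_set1_epi16(300);
    __m128i small = _mm_set1_epi16(-300);
    *signed_out   = _mm_packs_epi16(big, small);   /* bytes 0-7: 127, bytes 8-15: -128 */
    *unsigned_out = _mm_packus_epi16(big, small);  /* bytes 0-7: 255, bytes 8-15: 0    */
  }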
+/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
+/// the immediate-value parameter as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __imm
+///    An immediate value. Bits [2:0] select values from \a __a to be assigned
+///    to bits [15:0] of the result. \n
+/// 000: assign values from bits [15:0] of \a __a. \n
+/// 001: assign values from bits [31:16] of \a __a. \n
+/// 010: assign values from bits [47:32] of \a __a. \n
+/// 011: assign values from bits [63:48] of \a __a. \n
+/// 100: assign values from bits [79:64] of \a __a. \n
+/// 101: assign values from bits [95:80] of \a __a. \n
+/// 110: assign values from bits [111:96] of \a __a. \n
+/// 111: assign values from bits [127:112] of \a __a.
+/// \returns An integer whose lower 16 bits are selected from the 128-bit
+/// integer vector parameter and the remaining bits are assigned zeros.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_extract_epi16(__m128i __a, int __imm)
{
@@ -2283,6 +4160,26 @@ _mm_extract_epi16(__m128i __a, int __imm)
return (unsigned short)__b[__imm & 7];
}
+/// \brief Constructs a 128-bit integer vector by first making a copy of the
+/// 128-bit integer vector parameter, and then inserting the lower 16 bits
+/// of an integer parameter into an offset specified by the immediate-value
+/// parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
+/// result and then one of the eight elements in the result is replaced by
+/// the lower 16 bits of \a __b.
+/// \param __b
+/// An integer. The lower 16 bits of this parameter are written to the
+/// result beginning at an offset specified by \a __imm.
+/// \param __imm
+/// An immediate value specifying the bit offset in the result at which the
+/// lower 16 bits of \a __b are written.
+/// \returns A 128-bit integer vector containing the constructed values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_epi16(__m128i __a, int __b, int __imm)
{
@@ -2291,18 +4188,85 @@ _mm_insert_epi16(__m128i __a, int __b, int __imm)
return (__m128i)__c;
}
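A sketch of the extract/insert pair above (assumptions: SSE2 is enabled; the selector arguments are 16-bit lane indices and are typically required to be compile-time constants; the helper name is illustrative):

  #include <emmintrin.h>

  /* Read lane 3 of an [8 x i16] vector, then write 0x1234 into lane 5. */
  static __m128i extract_insert_sketch(__m128i v, int *lane3)
  {
    *lane3 = _mm_extract_epi16(v, 3);        /* zero-extended bits [63:48] */
    return _mm_insert_epi16(v, 0x1234, 5);   /* replaces bits [95:80] */
  }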
+/// \brief Copies the values of the most significant bits from each 8-bit
+/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
+/// value, zero-extends the value, and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bits from each 8-bit element in \a __a,
+/// written to bits [15:0]. The other bits are assigned zeros.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_epi8(__m128i __a)
{
return __builtin_ia32_pmovmskb128((__v16qi)__a);
}
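A sketch of the common byte-search idiom built from the compare and movemask intrinsics above (assumptions: SSE2 is enabled; p is assumed to have 16 readable bytes; the helper name is illustrative):

  #include <emmintrin.h>

  /* Bit i of the returned mask is set when byte i of the 16-byte chunk
     equals the needle. */
  static int match_mask_sketch(const unsigned char *p, unsigned char needle)
  {
    __m128i chunk = _mm_loadu_si128((const __m128i *)p);
    __m128i eq    = _mm_cmpeq_epi8(chunk, _mm_set1_epi8((char)needle));
    return _mm_movemask_epi8(eq);
  }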
+/// \brief Constructs a 128-bit integer vector by shuffling four 32-bit
+/// elements of a 128-bit integer vector parameter, using the immediate-value
+/// parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
+///
+/// \param a
+/// A 128-bit integer vector containing the values to be copied.
+/// \param imm
+/// An immediate value containing an 8-bit value specifying which elements to
+///    copy from \a a. The destinations within the 128-bit result are assigned
+/// values as follows: \n
+/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
+/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
+/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
+/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
+/// Bit value assignments: \n
+/// 00: assign values from bits [31:0] of \a a. \n
+/// 01: assign values from bits [63:32] of \a a. \n
+/// 10: assign values from bits [95:64] of \a a. \n
+/// 11: assign values from bits [127:96] of \a a.
+/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
(__v4si)_mm_undefined_si128(), \
((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })
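A sketch of the shuffle above (assumptions: SSE2 is enabled; _MM_SHUFFLE is the selector-building macro from xmmintrin.h, which emmintrin.h includes; the helper name is illustrative):

  #include <emmintrin.h>

  /* Reverse the four 32-bit lanes: _MM_SHUFFLE(0, 1, 2, 3) == 0x1B picks
     lane 3 for the low result lane and lane 0 for the high one
     (_MM_SHUFFLE(3, 2, 1, 0) would be the identity). */
  static __m128i reverse_lanes_sketch(__m128i v)
  {
    return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
  }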
+/// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit
+/// elements of a 128-bit integer vector of [8 x i16], using the immediate
+/// value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
+///
+/// \param a
+/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
+/// [127:64] of the result.
+/// \param imm
+/// An 8-bit immediate value specifying which elements to copy from \a a. \n
+/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
+/// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
+/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
+/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
+/// Bit value assignments: \n
+/// 00: assign values from bits [15:0] of \a a. \n
+/// 01: assign values from bits [31:16] of \a a. \n
+/// 10: assign values from bits [47:32] of \a a. \n
+/// 11: assign values from bits [63:48] of \a a. \n
+/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
(__v8hi)_mm_undefined_si128(), \
@@ -2310,6 +4274,33 @@ _mm_movemask_epi8(__m128i __a)
((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
4, 5, 6, 7); })
+/// \brief Constructs a 128-bit integer vector by shuffling four upper 16-bit
+/// elements of a 128-bit integer vector of [8 x i16], using the immediate
+/// value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
+///
+/// \param a
+/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
+/// [63:0] of the result.
+/// \param imm
+/// An 8-bit immediate value specifying which elements to copy from \a a. \n
+/// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
+/// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
+/// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
+/// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
+/// Bit value assignments: \n
+/// 00: assign values from bits [79:64] of \a a. \n
+/// 01: assign values from bits [95:80] of \a a. \n
+/// 10: assign values from bits [111:96] of \a a. \n
+/// 11: assign values from bits [127:112] of \a a. \n
+/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
(__v8hi)_mm_undefined_si128(), \
@@ -2319,137 +4310,480 @@ _mm_movemask_epi8(__m128i __a)
4 + (((imm) >> 4) & 0x3), \
4 + (((imm) >> 6) & 0x3)); })
+/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
+/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [16 x i8].
+/// Bits [71:64] are written to bits [7:0] of the result. \n
+/// Bits [79:72] are written to bits [23:16] of the result. \n
+/// Bits [87:80] are written to bits [39:32] of the result. \n
+/// Bits [95:88] are written to bits [55:48] of the result. \n
+/// Bits [103:96] are written to bits [71:64] of the result. \n
+/// Bits [111:104] are written to bits [87:80] of the result. \n
+/// Bits [119:112] are written to bits [103:96] of the result. \n
+/// Bits [127:120] are written to bits [119:112] of the result.
+/// \param __b
+/// A 128-bit vector of [16 x i8]. \n
+/// Bits [71:64] are written to bits [15:8] of the result. \n
+/// Bits [79:72] are written to bits [31:24] of the result. \n
+/// Bits [87:80] are written to bits [47:40] of the result. \n
+/// Bits [95:88] are written to bits [63:56] of the result. \n
+/// Bits [103:96] are written to bits [79:72] of the result. \n
+/// Bits [111:104] are written to bits [95:88] of the result. \n
+/// Bits [119:112] are written to bits [111:104] of the result. \n
+/// Bits [127:120] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}
+/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of
+/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16].
+/// Bits [79:64] are written to bits [15:0] of the result. \n
+/// Bits [95:80] are written to bits [47:32] of the result. \n
+/// Bits [111:96] are written to bits [79:64] of the result. \n
+/// Bits [127:112] are written to bits [111:96] of the result.
+/// \param __b
+/// A 128-bit vector of [8 x i16].
+/// Bits [79:64] are written to bits [31:16] of the result. \n
+/// Bits [95:80] are written to bits [63:48] of the result. \n
+/// Bits [111:96] are written to bits [95:80] of the result. \n
+/// Bits [127:112] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}
+/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32]. \n
+/// Bits [95:64] are written to bits [31:0] of the destination. \n
+/// Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+/// A 128-bit vector of [4 x i32]. \n
+///    Bits [95:64] are written to bits [63:32] of the destination. \n
+/// Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
}
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+/// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
}
+/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
+/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [16 x i8]. \n
+/// Bits [7:0] are written to bits [7:0] of the result. \n
+/// Bits [15:8] are written to bits [23:16] of the result. \n
+/// Bits [23:16] are written to bits [39:32] of the result. \n
+/// Bits [31:24] are written to bits [55:48] of the result. \n
+/// Bits [39:32] are written to bits [71:64] of the result. \n
+/// Bits [47:40] are written to bits [87:80] of the result. \n
+/// Bits [55:48] are written to bits [103:96] of the result. \n
+/// Bits [63:56] are written to bits [119:112] of the result.
+/// \param __b
+/// A 128-bit vector of [16 x i8].
+/// Bits [7:0] are written to bits [15:8] of the result. \n
+/// Bits [15:8] are written to bits [31:24] of the result. \n
+/// Bits [23:16] are written to bits [47:40] of the result. \n
+/// Bits [31:24] are written to bits [63:56] of the result. \n
+/// Bits [39:32] are written to bits [79:72] of the result. \n
+/// Bits [47:40] are written to bits [95:88] of the result. \n
+/// Bits [55:48] are written to bits [111:104] of the result. \n
+/// Bits [63:56] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}
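A sketch of the classic widening use of the unpack above (assumptions: SSE2 is enabled; the helper name is illustrative):

  #include <emmintrin.h>

  /* Interleaving with a zero vector zero-extends the low eight bytes of v
     into eight 16-bit lanes (data byte in the low half of each lane). */
  static __m128i widen_lo_u8_to_u16_sketch(__m128i v)
  {
    return _mm_unpacklo_epi8(v, _mm_setzero_si128());
  }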
+/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit
+/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
+/// [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16].
+/// Bits [15:0] are written to bits [15:0] of the result. \n
+/// Bits [31:16] are written to bits [47:32] of the result. \n
+/// Bits [47:32] are written to bits [79:64] of the result. \n
+/// Bits [63:48] are written to bits [111:96] of the result.
+/// \param __b
+/// A 128-bit vector of [8 x i16].
+/// Bits [15:0] are written to bits [31:16] of the result. \n
+/// Bits [31:16] are written to bits [63:48] of the result. \n
+/// Bits [47:32] are written to bits [95:80] of the result. \n
+/// Bits [63:48] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}
+/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32]. \n
+/// Bits [31:0] are written to bits [31:0] of the destination. \n
+/// Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+/// A 128-bit vector of [4 x i32]. \n
+///    Bits [31:0] are written to bits [63:32] of the destination. \n
+/// Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
}
+/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of
+/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [63:0] are written to bits [63:0] of the destination. \n
+/// \param __b
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [63:0] are written to bits [127:64] of the destination. \n
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
}
+/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
+/// integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector operand. The lower 64 bits are moved to the
+/// destination.
+/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_movepi64_pi64(__m128i __a)
{
return (__m64)__a[0];
}
+/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
+/// upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction.
+///
+/// \param __a
+/// A 64-bit value.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+/// the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movpi64_epi64(__m64 __a)
{
return (__m128i){ (long long)__a, 0 };
}
+/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
+/// integer vector, zeroing the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector operand. The lower 64 bits are moved to the
+/// destination.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+/// the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a)
{
return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
}
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
+/// double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
}
+/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
+/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
+/// double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
}
+/// \brief Extracts the sign bits of the double-precision values in the 128-bit
+/// vector of [2 x double], zero-extends the value, and writes it to the
+/// low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values with sign bits to
+/// be extracted.
+/// \returns The sign bits from each of the double-precision elements in \a __a,
+/// written to bits [1:0]. The remaining bits are assigned values of zero.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_pd(__m128d __a)
{
return __builtin_ia32_movmskpd((__v2df)__a);
}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double] from two
+/// 128-bit vector parameters of [2 x double], using the immediate-value
+/// parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
+///
+/// \param a
+/// A 128-bit vector of [2 x double].
+/// \param b
+/// A 128-bit vector of [2 x double].
+/// \param i
+/// An 8-bit immediate value. The least significant two bits specify which
+///    elements to copy from \a a and \a b: \n
+///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
+///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
+/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
+/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
+/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
0 + (((i) >> 0) & 0x1), \
2 + (((i) >> 1) & 0x1)); })
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+/// floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+/// bitwise pattern as the parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castpd_ps(__m128d __a)
{
return (__m128)__a;
}
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+/// parameter.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castpd_si128(__m128d __a)
{
return (__m128i)__a;
}
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+/// floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+/// bitwise pattern as the parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castps_pd(__m128 __a)
{
return (__m128d)__a;
}
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+/// parameter.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castps_si128(__m128 __a)
{
return (__m128i)__a;
}
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+/// bitwise pattern as the parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castsi128_ps(__m128i __a)
{
return (__m128)__a;
}
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+/// bitwise pattern as the parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castsi128_pd(__m128i __a)
{
return (__m128d)__a;
}
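A sketch of what the no-op casts above are typically used for (assumptions: SSE2 is enabled; the helper name is illustrative):

  #include <emmintrin.h>

  /* The casts are free bit reinterpretations, so integer logic can operate
     on float data: clearing each sign bit computes |x| for all four floats. */
  static __m128 abs_ps_sketch(__m128 v)
  {
    const __m128i sign_clear = _mm_set1_epi32(0x7FFFFFFF);
    return _mm_castsi128_ps(_mm_and_si128(_mm_castps_si128(v), sign_clear));
  }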
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_pause(void)
-{
- __builtin_ia32_pause();
-}
+#if defined(__cplusplus)
+extern "C" {
+#endif
+/// \brief Indicates that a spin loop is being executed for the purposes of
+/// optimizing power consumption during the loop.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
+///
+void _mm_pause(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
#undef __DEFAULT_FN_ATTRS
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
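A sketch tying _MM_SHUFFLE2 to _mm_shuffle_pd above (assumptions: SSE2 is enabled; the helper name is illustrative):

  #include <emmintrin.h>

  /* _MM_SHUFFLE2(hi_sel, lo_sel) builds the 2-bit selector; choosing element
     1 of a for the low half and element 0 of a for the high half swaps the
     two doubles. */
  static __m128d swap_pd_sketch(__m128d a)
  {
    return _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1));
  }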
diff --git a/lib/Headers/f16cintrin.h b/lib/Headers/f16cintrin.h
index 415bf732fb9f..180712ffc680 100644
--- a/lib/Headers/f16cintrin.h
+++ b/lib/Headers/f16cintrin.h
@@ -37,7 +37,7 @@
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTPH2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 16-bit half-precision float value.
@@ -59,17 +59,17 @@ _cvtsh_ss(unsigned short __a)
/// unsigned short _cvtss_sh(float a, const int imm);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCVTPS2PH instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 32-bit single-precision float value to be converted to a 16-bit
/// half-precision float value.
/// \param imm
-/// An immediate value controlling rounding using bits [2:0]:
-/// 000: Nearest
-/// 001: Down
-/// 010: Up
-/// 011: Truncate
+/// An immediate value controlling rounding using bits [2:0]: \n
+/// 000: Nearest \n
+/// 001: Down \n
+/// 010: Up \n
+/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) \
@@ -85,16 +85,16 @@ _cvtsh_ss(unsigned short __a)
/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
/// \endcode
///
-/// This intrinsic corresponds to the \c VCVTPS2PH instruction.
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 128-bit vector containing 32-bit float values.
/// \param imm
-/// An immediate value controlling rounding using bits [2:0]:
-/// 000: Nearest
-/// 001: Down
-/// 010: Up
-/// 011: Truncate
+/// An immediate value controlling rounding using bits [2:0]: \n
+/// 000: Nearest \n
+/// 001: Down \n
+/// 010: Up \n
+/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing converted 16-bit half-precision float
/// values. The lower 64 bits are used to store the converted 16-bit
@@ -107,7 +107,7 @@ _cvtsh_ss(unsigned short __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTPH2PS instruction.
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values. The lower
diff --git a/lib/Headers/float.h b/lib/Headers/float.h
index a28269ebebbe..0f453d87cbcb 100644
--- a/lib/Headers/float.h
+++ b/lib/Headers/float.h
@@ -27,9 +27,12 @@
/* If we're on MinGW, fall back to the system's float.h, which might have
* additional definitions provided for Windows.
* For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ *
+ * Also fall back on Darwin to allow additional definitions and
+ * implementation-defined values.
*/
-#if (defined(__MINGW32__) || defined(_MSC_VER)) && __STDC_HOSTED__ && \
- __has_include_next(<float.h>)
+#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
+ __STDC_HOSTED__ && __has_include_next(<float.h>)
# include_next <float.h>
/* Undefine anything that we'll be redefining below. */
diff --git a/lib/Headers/fxsrintrin.h b/lib/Headers/fxsrintrin.h
index ac6026aa5ba2..786081ca8eab 100644
--- a/lib/Headers/fxsrintrin.h
+++ b/lib/Headers/fxsrintrin.h
@@ -30,25 +30,75 @@
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr")))
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+/// memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
-_fxsave(void *__p) {
+_fxsave(void *__p)
+{
return __builtin_ia32_fxsave(__p);
}
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+/// memory region pointed to by the input parameter \a __p. The contents of
+/// this memory region should have been written to by a previous \c _fxsave
+/// or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
-_fxsave64(void *__p) {
- return __builtin_ia32_fxsave64(__p);
+_fxrstor(void *__p)
+{
+ return __builtin_ia32_fxrstor(__p);
}
+#ifdef __x86_64__
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+/// memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
-_fxrstor(void *__p) {
- return __builtin_ia32_fxrstor(__p);
+_fxsave64(void *__p)
+{
+ return __builtin_ia32_fxsave64(__p);
}
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+/// memory region pointed to by the input parameter \a __p. The contents of
+/// this memory region should have been written to by a previous \c _fxsave
+/// or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
-_fxrstor64(void *__p) {
+_fxrstor64(void *__p)
+{
return __builtin_ia32_fxrstor64(__p);
}
+#endif
#undef __DEFAULT_FN_ATTRS
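A small illustrative sketch (not from the patch) of how the intrinsics documented above are typically used; the 512-byte, 16-byte-aligned buffer is the caller's responsibility, and FXSR support (-mfxsr) is assumed.

#include <x86intrin.h>

static unsigned char __attribute__((aligned(16))) fx_state[512];

void save_fp_state(void)    { _fxsave(fx_state);  }
void restore_fp_state(void) { _fxrstor(fx_state); }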
diff --git a/lib/Headers/ia32intrin.h b/lib/Headers/ia32intrin.h
index 397f3fd13e01..4928300103ad 100644
--- a/lib/Headers/ia32intrin.h
+++ b/lib/Headers/ia32intrin.h
@@ -60,12 +60,6 @@ __rdpmc(int __A) {
return __builtin_ia32_rdpmc(__A);
}
-/* __rdtsc */
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
-__rdtsc(void) {
- return __builtin_ia32_rdtsc();
-}
-
/* __rdtscp */
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
__rdtscp(unsigned int *__A) {
diff --git a/lib/Headers/immintrin.h b/lib/Headers/immintrin.h
index 4b2752353d6f..7f91d49fbcec 100644
--- a/lib/Headers/immintrin.h
+++ b/lib/Headers/immintrin.h
@@ -69,9 +69,44 @@
Intel documents these as being in immintrin.h, and
they depend on typedefs from avxintrin.h. */
+/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector
+/// containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+/// A 256-bit vector containing 32-bit single-precision float values to be
+/// converted to 16-bit half-precision float values.
+/// \param imm
+/// An immediate value controlling rounding using bits [2:0]: \n
+/// 000: Nearest \n
+/// 001: Down \n
+/// 010: Up \n
+/// 011: Truncate \n
+/// 1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing the converted 16-bit half-precision
+/// float values.
#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
(__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+/// values into a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector containing 16-bit half-precision float values to be
+/// converted to 32-bit single-precision float values.
+/// \returns A vector of [8 x float] containing the converted 32-bit
+/// single-precision float values.
static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
_mm256_cvtph_ps(__m128i __a)
{
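Illustrative only (not part of the patch): the two conversions documented above composed into a precision-truncating round trip, again assuming F16C is enabled.

#include <immintrin.h>

__m256 truncate_to_half_precision(__m256 v)
{
  __m128i h = _mm256_cvtps_ph(v, 0);   /* 000: round to nearest */
  return _mm256_cvtph_ps(h);           /* widen back to [8 x float] */
}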
diff --git a/lib/Headers/intrin.h b/lib/Headers/intrin.h
index f18711ad1ecf..7c91ebaee8cb 100644
--- a/lib/Headers/intrin.h
+++ b/lib/Headers/intrin.h
@@ -34,6 +34,10 @@
#include <x86intrin.h>
#endif
+#if defined(__arm__)
+#include <armintr.h>
+#endif
+
/* For the definition of jmp_buf. */
#if __STDC_HOSTED__
#include <setjmp.h>
@@ -62,7 +66,9 @@ void __cpuid(int[4], int);
static __inline__
void __cpuidex(int[4], int, int);
void __debugbreak(void);
+static __inline__
__int64 __emul(int, int);
+static __inline__
unsigned __int64 __emulu(unsigned int, unsigned int);
void __cdecl __fastfail(unsigned int);
unsigned int __getcallerseflags(void);
@@ -93,6 +99,7 @@ static __inline__
void __movsd(unsigned long *, unsigned long const *, size_t);
static __inline__
void __movsw(unsigned short *, unsigned short const *, size_t);
+static __inline__
void __nop(void);
void __nvreg_restore_fence(void);
void __nvreg_save_fence(void);
@@ -249,10 +256,12 @@ static __inline__
unsigned long __cdecl _lrotl(unsigned long, int);
static __inline__
unsigned long __cdecl _lrotr(unsigned long, int);
-static __inline__
-void _ReadBarrier(void);
-static __inline__
-void _ReadWriteBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void);
static __inline__
void *_ReturnAddress(void);
unsigned int _rorx_u32(unsigned int, const unsigned int);
@@ -281,8 +290,9 @@ unsigned int _shrx_u32(unsigned int, unsigned int);
void _Store_HLERelease(long volatile *, long);
void _Store64_HLERelease(__int64 volatile *, __int64);
void _StorePointer_HLERelease(void *volatile *, void *);
-static __inline__
-void _WriteBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void);
unsigned __int32 xbegin(void);
void _xend(void);
static __inline__
@@ -307,7 +317,6 @@ void __lwpval64(unsigned __int64, unsigned int, unsigned int);
unsigned __int64 __lzcnt64(unsigned __int64);
static __inline__
void __movsq(unsigned long long *, unsigned long long const *, size_t);
-__int64 __mulh(__int64, __int64);
static __inline__
unsigned __int64 __popcnt64(unsigned __int64);
static __inline__
@@ -378,30 +387,15 @@ void *_InterlockedCompareExchangePointer(void *volatile *_Destination,
void *_Exchange, void *_Comparand);
void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
void *_Exchange, void *_Comparand);
-static __inline__
-__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
-static __inline__
-__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
-static __inline__
-__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value);
-static __inline__
-__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
long _InterlockedOr_np(long volatile *_Value, long _Mask);
short _InterlockedOr16_np(short volatile *_Value, short _Mask);
-static __inline__
-__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
char _InterlockedOr8_np(char volatile *_Value, char _Mask);
long _InterlockedXor_np(long volatile *_Value, long _Mask);
short _InterlockedXor16_np(short volatile *_Value, short _Mask);
-static __inline__
-__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
char _InterlockedXor8_np(char volatile *_Value, char _Mask);
-static __inline__
-__int64 _mul128(__int64 _Multiplier, __int64 _Multiplicand,
- __int64 *_HighProduct);
unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
__int64 _sarx_i64(__int64, unsigned int);
#if __STDC_HOSTED__
@@ -409,119 +403,44 @@ int __cdecl _setjmpex(jmp_buf);
#endif
unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
-/*
- * Multiply two 64-bit integers and obtain a 64-bit result.
- * The low-half is returned directly and the high half is in an out parameter.
- */
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_umul128(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand,
- unsigned __int64 *_HighProduct) {
- unsigned __int128 _FullProduct =
- (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
- *_HighProduct = _FullProduct >> 64;
- return _FullProduct;
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__umulh(unsigned __int64 _Multiplier, unsigned __int64 _Multiplicand) {
- unsigned __int128 _FullProduct =
- (unsigned __int128)_Multiplier * (unsigned __int128)_Multiplicand;
- return _FullProduct >> 64;
-}
+static __inline__
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
+static __inline__
+__int64 _mul128(__int64, __int64, __int64*);
+static __inline__
+unsigned __int64 _umul128(unsigned __int64,
+ unsigned __int64,
+ unsigned __int64*);
#endif /* __x86_64__ */
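A minimal usage sketch (not part of the patch) for the 64 x 64 -> 128-bit multiply helpers declared above; x86_64 only, and __int64 assumes MS-style integer types are available.

#include <intrin.h>

unsigned __int64 mul_u128_hi(unsigned __int64 a, unsigned __int64 b)
{
  unsigned __int64 hi;
  (void)_umul128(a, b, &hi);   /* low half returned, high half stored in hi */
  return hi;                   /* same value __umulh(a, b) would return */
}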
-/*----------------------------------------------------------------------------*\
-|* Multiplication
-\*----------------------------------------------------------------------------*/
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-__emul(int __in1, int __in2) {
- return (__int64)__in1 * (__int64)__in2;
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__emulu(unsigned int __in1, unsigned int __in2) {
- return (unsigned __int64)__in1 * (unsigned __int64)__in2;
-}
-/*----------------------------------------------------------------------------*\
-|* Bit Twiddling
-\*----------------------------------------------------------------------------*/
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_rotl8(unsigned char _Value, unsigned char _Shift) {
- _Shift &= 0x7;
- return _Shift ? (_Value << _Shift) | (_Value >> (8 - _Shift)) : _Value;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_rotr8(unsigned char _Value, unsigned char _Shift) {
- _Shift &= 0x7;
- return _Shift ? (_Value >> _Shift) | (_Value << (8 - _Shift)) : _Value;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-_rotl16(unsigned short _Value, unsigned char _Shift) {
- _Shift &= 0xf;
- return _Shift ? (_Value << _Shift) | (_Value >> (16 - _Shift)) : _Value;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-_rotr16(unsigned short _Value, unsigned char _Shift) {
- _Shift &= 0xf;
- return _Shift ? (_Value >> _Shift) | (_Value << (16 - _Shift)) : _Value;
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_rotl(unsigned int _Value, int _Shift) {
- _Shift &= 0x1f;
- return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_rotr(unsigned int _Value, int _Shift) {
- _Shift &= 0x1f;
- return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-_lrotl(unsigned long _Value, int _Shift) {
- _Shift &= 0x1f;
- return _Shift ? (_Value << _Shift) | (_Value >> (32 - _Shift)) : _Value;
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-_lrotr(unsigned long _Value, int _Shift) {
- _Shift &= 0x1f;
- return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value;
-}
-static
-__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_rotl64(unsigned __int64 _Value, int _Shift) {
- _Shift &= 0x3f;
- return _Shift ? (_Value << _Shift) | (_Value >> (64 - _Shift)) : _Value;
-}
-static
-__inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-_rotr64(unsigned __int64 _Value, int _Shift) {
- _Shift &= 0x3f;
- return _Shift ? (_Value >> _Shift) | (_Value << (64 - _Shift)) : _Value;
-}
+#if defined(__x86_64__) || defined(__arm__)
+
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
+
+#endif
+
/*----------------------------------------------------------------------------*\
|* Bit Counting and Testing
\*----------------------------------------------------------------------------*/
static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanForward(unsigned long *_Index, unsigned long _Mask) {
- if (!_Mask)
- return 0;
- *_Index = __builtin_ctzl(_Mask);
- return 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanReverse(unsigned long *_Index, unsigned long _Mask) {
- if (!_Mask)
- return 0;
- *_Index = 31 - __builtin_clzl(_Mask);
- return 1;
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__popcnt16(unsigned short _Value) {
- return __builtin_popcount((int)_Value);
-}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__popcnt(unsigned int _Value) {
- return __builtin_popcount(_Value);
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittest(long const *_BitBase, long _BitPos) {
return (*_BitBase >> _BitPos) & 1;
}
@@ -548,26 +467,24 @@ _interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
return (_PrevVal >> _BitPos) & 1;
}
-#ifdef __x86_64__
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask) {
- if (!_Mask)
- return 0;
- *_Index = __builtin_ctzll(_Mask);
- return 1;
+_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
+ long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_ACQUIRE);
+ return (_PrevVal >> _BitPos) & 1;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask) {
- if (!_Mask)
- return 0;
- *_Index = 63 - __builtin_clzll(_Mask);
- return 1;
+_interlockedbittestandset_nf(long volatile *_BitBase, long _BitPos) {
+ long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELAXED);
+ return (_PrevVal >> _BitPos) & 1;
}
-static __inline__
-unsigned __int64 __DEFAULT_FN_ATTRS
-__popcnt64(unsigned __int64 _Value) {
- return __builtin_popcountll(_Value);
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_rel(long volatile *_BitBase, long _BitPos) {
+ long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELEASE);
+ return (_PrevVal >> _BitPos) & 1;
}
+#endif
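A brief sketch (ARM/AArch64 only, per the guard above; not from the patch) of the acquire-ordered bit-test-and-set added here.

/* Returns nonzero if bit 3 was already set; the set itself has acquire
 * ordering, so it can serve as a lightweight try-lock test. */
int try_claim_slot(long volatile *flags)
{
  return _interlockedbittestandset_acq(flags, 3);
}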
+#ifdef __x86_64__
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
return (*_BitBase >> _BitPos) & 1;
@@ -600,196 +517,449 @@ _interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
/*----------------------------------------------------------------------------*\
|* Interlocked Exchange Add
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd8(char volatile *_Addend, char _Value) {
- return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
}
-static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd16(short volatile *_Addend, short _Value) {
- return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
-}
-#ifdef __x86_64__
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value) {
- return __atomic_fetch_add(_Addend, _Value, __ATOMIC_SEQ_CST);
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
}
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange Sub
-\*----------------------------------------------------------------------------*/
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub8(char volatile *_Subend, char _Value) {
- return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub16(short volatile *_Subend, short _Value) {
- return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
}
static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub(long volatile *_Subend, long _Value) {
- return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value) {
- return __atomic_fetch_sub(_Subend, _Value, __ATOMIC_SEQ_CST);
+_InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value) {
+ return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
}
#endif
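To make the suffix convention introduced above concrete, here is a hedged sketch (not in the patch) showing how each variant maps onto a C11-style ordering on ARM/AArch64.

long add_acquire(long volatile *counter, long v)
{
  return _InterlockedExchangeAdd_acq(counter, v);  /* __ATOMIC_ACQUIRE */
}
long add_no_fence(long volatile *counter, long v)
{
  return _InterlockedExchangeAdd_nf(counter, v);   /* __ATOMIC_RELAXED */
}
long add_release(long volatile *counter, long v)
{
  return _InterlockedExchangeAdd_rel(counter, v);  /* __ATOMIC_RELEASE */
}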
/*----------------------------------------------------------------------------*\
|* Interlocked Increment
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedIncrement16(short volatile *_Value) {
- return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedIncrement16_acq(short volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_nf(short volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_rel(short volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_acq(long volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_nf(long volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_rel(long volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedIncrement64(__int64 volatile *_Value) {
- return __atomic_add_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedIncrement64_acq(__int64 volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_nf(__int64 volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_rel(__int64 volatile *_Value) {
+ return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Decrement
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedDecrement16(short volatile *_Value) {
- return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedDecrement16_acq(short volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_nf(short volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_rel(short volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_acq(long volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_nf(long volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_rel(long volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_acq(__int64 volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedDecrement64(__int64 volatile *_Value) {
- return __atomic_sub_fetch(_Value, 1, __ATOMIC_SEQ_CST);
+_InterlockedDecrement64_nf(__int64 volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_rel(__int64 volatile *_Value) {
+ return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked And
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedAnd8(char volatile *_Value, char _Mask) {
- return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd8_acq(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_nf(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_rel(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_acq(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_nf(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedAnd16(short volatile *_Value, short _Mask) {
- return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd16_rel(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
}
static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedAnd(long volatile *_Value, long _Mask) {
- return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd_acq(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_nf(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_rel(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask) {
- return __atomic_fetch_and(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Or
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_acq(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_nf(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedOr8(char volatile *_Value, char _Mask) {
- return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr8_rel(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_acq(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_nf(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedOr16(short volatile *_Value, short _Mask) {
- return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr16_rel(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
}
static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedOr(long volatile *_Value, long _Mask) {
- return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr_acq(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_nf(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_rel(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedOr64(__int64 volatile *_Value, __int64 _Mask) {
- return __atomic_fetch_or(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Xor
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_acq(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_nf(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedXor8(char volatile *_Value, char _Mask) {
- return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor8_rel(char volatile *_Value, char _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_acq(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_nf(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedXor16(short volatile *_Value, short _Mask) {
- return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor16_rel(short volatile *_Value, short _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
}
static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedXor(long volatile *_Value, long _Mask) {
- return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor_acq(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_nf(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_rel(long volatile *_Value, long _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedXor64(__int64 volatile *_Value, __int64 _Mask) {
- return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_SEQ_CST);
+_InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask) {
+ return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Exchange
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedExchange8(char volatile *_Target, char _Value) {
- __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+_InterlockedExchange8_acq(char volatile *_Target, char _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+ return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_nf(char volatile *_Target, char _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+ return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_rel(char volatile *_Target, char _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
return _Value;
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedExchange16(short volatile *_Target, short _Value) {
- __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+_InterlockedExchange16_acq(short volatile *_Target, short _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+ return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_nf(short volatile *_Target, short _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+ return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_rel(short volatile *_Target, short _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+ return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_acq(long volatile *_Target, long _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+ return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_nf(long volatile *_Target, long _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+ return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_rel(long volatile *_Target, long _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+ return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+ return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
return _Value;
}
-#ifdef __x86_64__
static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchange64(__int64 volatile *_Target, __int64 _Value) {
- __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_SEQ_CST);
+_InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value) {
+ __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
return _Value;
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Compare Exchange
\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
static __inline__ char __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange8(char volatile *_Destination,
+_InterlockedCompareExchange8_acq(char volatile *_Destination,
char _Exchange, char _Comparand) {
__atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
- __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+ __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+ return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_nf(char volatile *_Destination,
+ char _Exchange, char _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+ return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_rel(char volatile *_Destination,
+ char _Exchange, char _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
return _Comparand;
}
static __inline__ short __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange16(short volatile *_Destination,
+_InterlockedCompareExchange16_acq(short volatile *_Destination,
short _Exchange, short _Comparand) {
__atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
- __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+ __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
return _Comparand;
}
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange64(__int64 volatile *_Destination,
- __int64 _Exchange, __int64 _Comparand) {
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_nf(short volatile *_Destination,
+ short _Exchange, short _Comparand) {
__atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
- __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+ __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
return _Comparand;
}
-/*----------------------------------------------------------------------------*\
-|* Barriers
-\*----------------------------------------------------------------------------*/
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadWriteBarrier(void) {
- __atomic_signal_fence(__ATOMIC_SEQ_CST);
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_rel(short volatile *_Destination,
+ short _Exchange, short _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+ return _Comparand;
}
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadBarrier(void) {
- __atomic_signal_fence(__ATOMIC_SEQ_CST);
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_acq(long volatile *_Destination,
+ long _Exchange, long _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+ return _Comparand;
}
-static __inline__ void __DEFAULT_FN_ATTRS
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_WriteBarrier(void) {
- __atomic_signal_fence(__ATOMIC_SEQ_CST);
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_nf(long volatile *_Destination,
+ long _Exchange, long _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+ return _Comparand;
}
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-__faststorefence(void) {
- __atomic_thread_fence(__ATOMIC_SEQ_CST);
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_rel(long volatile *_Destination,
+ long _Exchange, long _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+ return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
+ __int64 _Exchange, __int64 _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+ return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
+ __int64 _Exchange, __int64 _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+ return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
+ __int64 _Exchange, __int64 _Comparand) {
+ __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+ __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+ return _Comparand;
}
#endif
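A sketch (not from the patch) of the usual compare-exchange retry loop, built on the acquire variant defined above; ARM/AArch64 only.

/* Atomically raises *dst to at least v and returns the value observed
 * before the final attempt.  _InterlockedCompareExchange_acq returns the
 * previous contents of *dst, so equality with `expected` signals success. */
long atomic_fetch_max(long volatile *dst, long v)
{
  long expected = *dst;
  while (expected < v) {
    long prev = _InterlockedCompareExchange_acq(dst, v, expected);
    if (prev == expected)
      break;
    expected = prev;   /* lost the race; retry with the observed value */
  }
  return expected;
}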
/*----------------------------------------------------------------------------*\
@@ -840,59 +1010,39 @@ __readgsqword(unsigned long __offset) {
#if defined(__i386__) || defined(__x86_64__)
static __inline__ void __DEFAULT_FN_ATTRS
__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
- __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)
- : "%edi", "%esi", "%ecx");
+ __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n));
}
static __inline__ void __DEFAULT_FN_ATTRS
__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
- __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)
- : "%edi", "%esi", "%ecx");
+ __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n));
}
static __inline__ void __DEFAULT_FN_ATTRS
__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
- __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n)
- : "%edi", "%esi", "%ecx");
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosb(unsigned char *__dst, unsigned char __x, size_t __n) {
- __asm__("rep stosb" : : "D"(__dst), "a"(__x), "c"(__n)
- : "%edi", "%ecx");
+ __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n));
}
static __inline__ void __DEFAULT_FN_ATTRS
__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
- __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)
- : "%edi", "%ecx");
+ __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n));
}
static __inline__ void __DEFAULT_FN_ATTRS
__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
- __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n)
- : "%edi", "%ecx");
+ __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n));
}
#endif
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
- __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)
- : "%edi", "%esi", "%ecx");
+ __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n));
}
static __inline__ void __DEFAULT_FN_ATTRS
__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
- __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)
- : "%edi", "%ecx");
+ __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n));
}
#endif
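For context, a minimal sketch (not part of the patch) using two of the string intrinsics whose inline-asm clobber lists are trimmed above; i386/x86_64 only.

#include <stddef.h>

/* Zero-fill then copy n elements; these expand to rep stosl / rep movsl. */
void copy_dwords(unsigned long *dst, unsigned long const *src, size_t n)
{
  __stosd(dst, 0, n);
  __movsd(dst, src, n);
}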
/*----------------------------------------------------------------------------*\
|* Misc
\*----------------------------------------------------------------------------*/
-static __inline__ void * __DEFAULT_FN_ATTRS
-_AddressOfReturnAddress(void) {
- return (void*)((char*)__builtin_frame_address(0) + sizeof(void*));
-}
-static __inline__ void * __DEFAULT_FN_ATTRS
-_ReturnAddress(void) {
- return __builtin_return_address(0);
-}
#if defined(__i386__) || defined(__x86_64__)
static __inline__ void __DEFAULT_FN_ATTRS
__cpuid(int __info[4], int __level) {
@@ -914,6 +1064,10 @@ static __inline__ void __DEFAULT_FN_ATTRS
__halt(void) {
__asm__ volatile ("hlt");
}
+static __inline__ void __DEFAULT_FN_ATTRS
+__nop(void) {
+ __asm__ volatile ("nop");
+}
#endif
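An illustrative sketch (not in the patch) combining __cpuid, defined just above, with the newly added __nop.

/* Returns the highest standard CPUID leaf the processor supports. */
int max_cpuid_leaf(void)
{
  int info[4];
  __cpuid(info, 0);   /* leaf 0: info[0] = highest supported leaf */
  __nop();            /* emits a single NOP, as defined above */
  return info[0];
}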
/*----------------------------------------------------------------------------*\
diff --git a/lib/Headers/lzcntintrin.h b/lib/Headers/lzcntintrin.h
index 4c00e42ac3a9..3d2769da3bae 100644
--- a/lib/Headers/lzcntintrin.h
+++ b/lib/Headers/lzcntintrin.h
@@ -31,18 +31,48 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+/// An unsigned 16-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of leading zero
+/// bits in the operand.
static __inline__ unsigned short __DEFAULT_FN_ATTRS
__lzcnt16(unsigned short __X)
{
return __X ? __builtin_clzs(__X) : 16;
}
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+/// An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+/// bits in the operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__lzcnt32(unsigned int __X)
{
return __X ? __builtin_clz(__X) : 32;
}
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+/// An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+/// bits in the operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_lzcnt_u32(unsigned int __X)
{
@@ -50,12 +80,32 @@ _lzcnt_u32(unsigned int __X)
}
#ifdef __x86_64__
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+/// bits in the operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__lzcnt64(unsigned long long __X)
{
return __X ? __builtin_clzll(__X) : 64;
}
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+/// bits in the operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_lzcnt_u64(unsigned long long __X)
{
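Illustrative usage (not part of the patch) of the documented zero-input behavior; assumes LZCNT support (-mlzcnt).

#include <x86intrin.h>

unsigned int leading_zero_count(unsigned int x)
{
  return _lzcnt_u32(x);   /* returns 32 when x == 0, per the docs above */
}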
diff --git a/lib/Headers/mmintrin.h b/lib/Headers/mmintrin.h
index cefd6053aa80..e0c277a65a33 100644
--- a/lib/Headers/mmintrin.h
+++ b/lib/Headers/mmintrin.h
@@ -39,7 +39,7 @@ typedef char __v8qi __attribute__((__vector_size__(8)));
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c EMMS instruction.
+/// This intrinsic corresponds to the <c> EMMS </c> instruction.
///
static __inline__ void __DEFAULT_FN_ATTRS
_mm_empty(void)
@@ -52,7 +52,7 @@ _mm_empty(void)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __i
/// A 32-bit integer value.
@@ -69,7 +69,7 @@ _mm_cvtsi32_si64(int __i)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector.
@@ -85,7 +85,7 @@ _mm_cvtsi64_si32(__m64 __m)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVQ / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
///
/// \param __i
/// A 64-bit signed integer.
@@ -101,7 +101,7 @@ _mm_cvtsi64_m64(long long __i)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVQ / MOVD instruction.
+/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector.
@@ -121,7 +121,7 @@ _mm_cvtm64_si64(__m64 __m)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PACKSSWB instruction.
+/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
@@ -151,7 +151,7 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PACKSSDW instruction.
+/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
@@ -181,7 +181,7 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PACKUSWB instruction.
+/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
@@ -208,19 +208,19 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKHBW instruction.
+/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
///
/// \param __m1
-/// A 64-bit integer vector of [8 x i8].
-/// Bits [39:32] are written to bits [7:0] of the result.
-/// Bits [47:40] are written to bits [23:16] of the result.
-/// Bits [55:48] are written to bits [39:32] of the result.
+/// A 64-bit integer vector of [8 x i8]. \n
+/// Bits [39:32] are written to bits [7:0] of the result. \n
+/// Bits [47:40] are written to bits [23:16] of the result. \n
+/// Bits [55:48] are written to bits [39:32] of the result. \n
/// Bits [63:56] are written to bits [55:48] of the result.
/// \param __m2
/// A 64-bit integer vector of [8 x i8].
-/// Bits [39:32] are written to bits [15:8] of the result.
-/// Bits [47:40] are written to bits [31:24] of the result.
-/// Bits [55:48] are written to bits [47:40] of the result.
+/// Bits [39:32] are written to bits [15:8] of the result. \n
+/// Bits [47:40] are written to bits [31:24] of the result. \n
+/// Bits [55:48] are written to bits [47:40] of the result. \n
/// Bits [63:56] are written to bits [63:56] of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
/// values.
@@ -235,15 +235,15 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKHWD instruction.
+/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
-/// Bits [47:32] are written to bits [15:0] of the result.
+/// Bits [47:32] are written to bits [15:0] of the result. \n
/// Bits [63:48] are written to bits [47:32] of the result.
/// \param __m2
/// A 64-bit integer vector of [4 x i16].
-/// Bits [47:32] are written to bits [31:16] of the result.
+/// Bits [47:32] are written to bits [31:16] of the result. \n
/// Bits [63:48] are written to bits [63:48] of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
/// values.
@@ -258,7 +258,7 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKHDQ instruction.
+/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
@@ -279,19 +279,19 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKLBW instruction.
+/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
-/// Bits [7:0] are written to bits [7:0] of the result.
-/// Bits [15:8] are written to bits [23:16] of the result.
-/// Bits [23:16] are written to bits [39:32] of the result.
+/// Bits [7:0] are written to bits [7:0] of the result. \n
+/// Bits [15:8] are written to bits [23:16] of the result. \n
+/// Bits [23:16] are written to bits [39:32] of the result. \n
/// Bits [31:24] are written to bits [55:48] of the result.
/// \param __m2
/// A 64-bit integer vector of [8 x i8].
-/// Bits [7:0] are written to bits [15:8] of the result.
-/// Bits [15:8] are written to bits [31:24] of the result.
-/// Bits [23:16] are written to bits [47:40] of the result.
+/// Bits [7:0] are written to bits [15:8] of the result. \n
+/// Bits [15:8] are written to bits [31:24] of the result. \n
+/// Bits [23:16] are written to bits [47:40] of the result. \n
/// Bits [31:24] are written to bits [63:56] of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
/// values.
@@ -306,15 +306,15 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKLWD instruction.
+/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
-/// Bits [15:0] are written to bits [15:0] of the result.
+/// Bits [15:0] are written to bits [15:0] of the result. \n
/// Bits [31:16] are written to bits [47:32] of the result.
/// \param __m2
/// A 64-bit integer vector of [4 x i16].
-/// Bits [15:0] are written to bits [31:16] of the result.
+/// Bits [15:0] are written to bits [31:16] of the result. \n
/// Bits [31:16] are written to bits [63:48] of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
/// values.
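As a concrete reading of the bit mapping above, a hedged sketch (not from the patch) that interleaves the low halves of two [4 x i16] vectors.

#include <mmintrin.h>

/* With a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3} (element 0 lowest),
 * the result is {a0, b0, a1, b1}. */
__m64 interleave_low_words(__m64 a, __m64 b)
{
  return _mm_unpacklo_pi16(a, b);
}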
@@ -329,7 +329,7 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PUNPCKLDQ instruction.
+/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
@@ -352,7 +352,7 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDB instruction.
+/// This intrinsic corresponds to the <c> PADDB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
@@ -373,7 +373,7 @@ _mm_add_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDW instruction.
+/// This intrinsic corresponds to the <c> PADDW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -394,7 +394,7 @@ _mm_add_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDD instruction.
+/// This intrinsic corresponds to the <c> PADDD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32].
@@ -416,7 +416,7 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDSB instruction.
+/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
@@ -439,7 +439,7 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDSW instruction.
+/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -461,7 +461,7 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDUSB instruction.
+/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
@@ -483,7 +483,7 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PADDUSW instruction.
+/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -504,7 +504,7 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBB instruction.
+/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8] containing the minuends.
@@ -525,7 +525,7 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBW instruction.
+/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16] containing the minuends.
@@ -546,7 +546,7 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBD instruction.
+/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32] containing the minuends.
@@ -569,7 +569,7 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBSB instruction.
+/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8] containing the minuends.
@@ -592,7 +592,7 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBSW instruction.
+/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16] containing the minuends.
@@ -615,7 +615,7 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBUSB instruction.
+/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8] containing the minuends.
@@ -638,7 +638,7 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSUBUSW instruction.
+/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16] containing the minuends.
@@ -663,7 +663,7 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMADDWD instruction.
+/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -684,7 +684,7 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMULHW instruction.
+/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -705,7 +705,7 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMULLW instruction.
+/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -727,14 +727,15 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLW instruction.
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
/// \param __count
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
-/// values. If __count is greater or equal to 16, the result is set to all 0.
+/// values. If \a __count is greater or equal to 16, the result is set to all
+/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sll_pi16(__m64 __m, __m64 __count)
{
@@ -748,14 +749,15 @@ _mm_sll_pi16(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLW instruction.
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
/// \param __count
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
-/// values. If __count is greater or equal to 16, the result is set to all 0.
+/// values. If \a __count is greater or equal to 16, the result is set to all
+/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_slli_pi16(__m64 __m, int __count)
{
@@ -770,14 +772,15 @@ _mm_slli_pi16(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLD instruction.
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
/// \param __count
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
-/// values. If __count is greater or equal to 32, the result is set to all 0.
+/// values. If \a __count is greater or equal to 32, the result is set to all
+/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sll_pi32(__m64 __m, __m64 __count)
{
@@ -791,14 +794,15 @@ _mm_sll_pi32(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLD instruction.
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
/// \param __count
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
-/// values. If __count is greater or equal to 32, the result is set to all 0.
+/// values. If \a __count is greater or equal to 32, the result is set to all
+/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_slli_pi32(__m64 __m, int __count)
{
@@ -811,14 +815,14 @@ _mm_slli_pi32(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLQ instruction.
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
///
/// \param __m
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \param __count
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector containing the left-shifted value. If
-/// __count is greater or equal to 64, the result is set to 0.
+/// \a __count is greater or equal to 64, the result is set to 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sll_si64(__m64 __m, __m64 __count)
{
@@ -831,14 +835,14 @@ _mm_sll_si64(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSLLQ instruction.
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
///
/// \param __m
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \param __count
/// A 32-bit integer value.
/// \returns A 64-bit integer vector containing the left-shifted value. If
-/// __count is greater or equal to 64, the result is set to 0.
+/// \a __count is greater or equal to 64, the result is set to 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_slli_si64(__m64 __m, int __count)
{
@@ -854,7 +858,7 @@ _mm_slli_si64(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRAW instruction.
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
@@ -876,7 +880,7 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRAW instruction.
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
@@ -899,7 +903,7 @@ _mm_srai_pi16(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRAD instruction.
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
@@ -921,7 +925,7 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRAD instruction.
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
@@ -943,7 +947,7 @@ _mm_srai_pi32(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLW instruction.
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
@@ -964,7 +968,7 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLW instruction.
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [4 x i16].
@@ -986,7 +990,7 @@ _mm_srli_pi16(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLD instruction.
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
@@ -1007,7 +1011,7 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLD instruction.
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector of [2 x i32].
@@ -1027,7 +1031,7 @@ _mm_srli_pi32(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLQ instruction.
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
///
/// \param __m
/// A 64-bit integer vector interpreted as a single 64-bit integer.
@@ -1046,7 +1050,7 @@ _mm_srl_si64(__m64 __m, __m64 __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSRLQ instruction.
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
///
/// \param __m
/// A 64-bit integer vector interpreted as a single 64-bit integer.
@@ -1063,7 +1067,7 @@ _mm_srli_si64(__m64 __m, int __count)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PAND instruction.
+/// This intrinsic corresponds to the <c> PAND </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector.
@@ -1083,7 +1087,7 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PANDN instruction.
+/// This intrinsic corresponds to the <c> PANDN </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector. The one's complement of this parameter is used
@@ -1102,7 +1106,7 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c POR instruction.
+/// This intrinsic corresponds to the <c> POR </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector.
@@ -1120,7 +1124,7 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PXOR instruction.
+/// This intrinsic corresponds to the <c> PXOR </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector.
@@ -1141,7 +1145,7 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPEQB instruction.
+/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
@@ -1162,7 +1166,7 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPEQW instruction.
+/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -1183,7 +1187,7 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPEQD instruction.
+/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32].
@@ -1204,7 +1208,7 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPGTB instruction.
+/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8].
@@ -1225,7 +1229,7 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPGTW instruction.
+/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [4 x i16].
@@ -1246,7 +1250,7 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PCMPGTD instruction.
+/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [2 x i32].
@@ -1264,7 +1268,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the the \c VXORPS / XORPS instruction.
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
///
/// \returns An initialized 64-bit integer vector with all elements set to zero.
static __inline__ __m64 __DEFAULT_FN_ATTRS
@@ -1356,7 +1360,7 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSHUFD / PSHUFD instruction.
+/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param __i
/// A 32-bit integer value used to initialize each vector element of the
@@ -1374,7 +1378,7 @@ _mm_set1_pi32(int __i)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPSHUFLW / PSHUFLW instruction.
+/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param __w
/// A 16-bit integer value used to initialize each vector element of the
@@ -1391,8 +1395,8 @@ _mm_set1_pi16(short __w)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPUNPCKLBW + VPSHUFLW / \c PUNPCKLBW +
-/// PSHUFLW instruction.
+/// This intrinsic corresponds to the <c> VPUNPCKLBW + VPSHUFLW / PUNPCKLBW +
+/// PSHUFLW </c> instruction.
///
/// \param __b
/// An 8-bit integer value used to initialize each vector element of the
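
The mmintrin.h hunks above only retag the Doxygen markup and re-wrap the shift-count notes; the documented rule itself is easy to check. A minimal host-side sketch, not part of the patch, assuming an x86-64 compiler with MMX enabled (-mmmx):

#include <stdio.h>
#include <mmintrin.h>

int main(void) {
    __m64 v = _mm_set_pi16(4, 3, 2, 1);            /* lanes, low to high: 1, 2, 3, 4 */
    __m64 a = _mm_slli_pi16(v, 3);                 /* each 16-bit lane shifted left by 3 */
    __m64 b = _mm_slli_pi16(v, 16);                /* count >= 16: every lane becomes 0 */
    unsigned long long ra = (unsigned long long)_mm_cvtm64_si64(a);
    unsigned long long rb = (unsigned long long)_mm_cvtm64_si64(b);
    _mm_empty();                                   /* leave MMX state before calling libc */
    printf("%016llx %016llx\n", ra, rb);           /* second value prints as all zeros */
    return 0;
}
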
diff --git a/lib/Headers/module.modulemap b/lib/Headers/module.modulemap
index 3e40d2c08d8c..11ef2f902945 100644
--- a/lib/Headers/module.modulemap
+++ b/lib/Headers/module.modulemap
@@ -63,11 +63,13 @@ module _Builtin_intrinsics [system] [extern_c] {
textual header "mwaitxintrin.h"
explicit module mm_malloc {
+ requires !freestanding
header "mm_malloc.h"
export * // note: for <stdlib.h> dependency
}
explicit module cpuid {
+ requires gnuinlineasm
header "cpuid.h"
}
diff --git a/lib/Headers/opencl-c.h b/lib/Headers/opencl-c.h
index 802927490e7f..0c25d312709d 100644
--- a/lib/Headers/opencl-c.h
+++ b/lib/Headers/opencl-c.h
@@ -17,6 +17,7 @@
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define __ovld __attribute__((overloadable))
+#define __conv __attribute__((convergent))
// Optimizations
#define __purefn __attribute__((pure))
@@ -9810,14 +9811,6 @@ float3 __ovld __cnfn native_cos(float3 x);
float4 __ovld __cnfn native_cos(float4 x);
float8 __ovld __cnfn native_cos(float8 x);
float16 __ovld __cnfn native_cos(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_cos(double x);
-double2 __ovld __cnfn native_cos(double2 x);
-double3 __ovld __cnfn native_cos(double3 x);
-double4 __ovld __cnfn native_cos(double4 x);
-double8 __ovld __cnfn native_cos(double8 x);
-double16 __ovld __cnfn native_cos(double16 x);
-#endif //cl_khr_fp64
/**
* Compute x / y over an implementation-defined range.
@@ -9829,14 +9822,6 @@ float3 __ovld __cnfn native_divide(float3 x, float3 y);
float4 __ovld __cnfn native_divide(float4 x, float4 y);
float8 __ovld __cnfn native_divide(float8 x, float8 y);
float16 __ovld __cnfn native_divide(float16 x, float16 y);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_divide(double x, double y);
-double2 __ovld __cnfn native_divide(double2 x, double2 y);
-double3 __ovld __cnfn native_divide(double3 x, double3 y);
-double4 __ovld __cnfn native_divide(double4 x, double4 y);
-double8 __ovld __cnfn native_divide(double8 x, double8 y);
-double16 __ovld __cnfn native_divide(double16 x, double16 y);
-#endif //cl_khr_fp64
/**
* Compute the base- e exponential of x over an
@@ -9849,14 +9834,6 @@ float3 __ovld __cnfn native_exp(float3 x);
float4 __ovld __cnfn native_exp(float4 x);
float8 __ovld __cnfn native_exp(float8 x);
float16 __ovld __cnfn native_exp(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_exp(double x);
-double2 __ovld __cnfn native_exp(double2 x);
-double3 __ovld __cnfn native_exp(double3 x);
-double4 __ovld __cnfn native_exp(double4 x);
-double8 __ovld __cnfn native_exp(double8 x);
-double16 __ovld __cnfn native_exp(double16 x);
-#endif //cl_khr_fp64
/**
* Compute the base- 2 exponential of x over an
@@ -9869,14 +9846,6 @@ float3 __ovld __cnfn native_exp2(float3 x);
float4 __ovld __cnfn native_exp2(float4 x);
float8 __ovld __cnfn native_exp2(float8 x);
float16 __ovld __cnfn native_exp2(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_exp2(double x);
-double2 __ovld __cnfn native_exp2(double2 x);
-double3 __ovld __cnfn native_exp2(double3 x);
-double4 __ovld __cnfn native_exp2(double4 x);
-double8 __ovld __cnfn native_exp2(double8 x);
-double16 __ovld __cnfn native_exp2(double16 x);
-#endif //cl_khr_fp64
/**
* Compute the base- 10 exponential of x over an
@@ -9889,14 +9858,6 @@ float3 __ovld __cnfn native_exp10(float3 x);
float4 __ovld __cnfn native_exp10(float4 x);
float8 __ovld __cnfn native_exp10(float8 x);
float16 __ovld __cnfn native_exp10(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_exp10(double x);
-double2 __ovld __cnfn native_exp10(double2 x);
-double3 __ovld __cnfn native_exp10(double3 x);
-double4 __ovld __cnfn native_exp10(double4 x);
-double8 __ovld __cnfn native_exp10(double8 x);
-double16 __ovld __cnfn native_exp10(double16 x);
-#endif //cl_khr_fp64
/**
* Compute natural logarithm over an implementationdefined
@@ -9909,14 +9870,6 @@ float3 __ovld __cnfn native_log(float3 x);
float4 __ovld __cnfn native_log(float4 x);
float8 __ovld __cnfn native_log(float8 x);
float16 __ovld __cnfn native_log(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_log(double x);
-double2 __ovld __cnfn native_log(double2 x);
-double3 __ovld __cnfn native_log(double3 x);
-double4 __ovld __cnfn native_log(double4 x);
-double8 __ovld __cnfn native_log(double8 x);
-double16 __ovld __cnfn native_log(double16 x);
-#endif //cl_khr_fp64
/**
* Compute a base 2 logarithm over an implementationdefined
@@ -9928,14 +9881,6 @@ float3 __ovld __cnfn native_log2(float3 x);
float4 __ovld __cnfn native_log2(float4 x);
float8 __ovld __cnfn native_log2(float8 x);
float16 __ovld __cnfn native_log2(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_log2(double x);
-double2 __ovld __cnfn native_log2(double2 x);
-double3 __ovld __cnfn native_log2(double3 x);
-double4 __ovld __cnfn native_log2(double4 x);
-double8 __ovld __cnfn native_log2(double8 x);
-double16 __ovld __cnfn native_log2(double16 x);
-#endif //cl_khr_fp64
/**
* Compute a base 10 logarithm over an implementationdefined
@@ -9947,14 +9892,6 @@ float3 __ovld __cnfn native_log10(float3 x);
float4 __ovld __cnfn native_log10(float4 x);
float8 __ovld __cnfn native_log10(float8 x);
float16 __ovld __cnfn native_log10(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_log10(double x);
-double2 __ovld __cnfn native_log10(double2 x);
-double3 __ovld __cnfn native_log10(double3 x);
-double4 __ovld __cnfn native_log10(double4 x);
-double8 __ovld __cnfn native_log10(double8 x);
-double16 __ovld __cnfn native_log10(double16 x);
-#endif //cl_khr_fp64
/**
* Compute x to the power y, where x is >= 0. The range of
@@ -9967,14 +9904,6 @@ float3 __ovld __cnfn native_powr(float3 x, float3 y);
float4 __ovld __cnfn native_powr(float4 x, float4 y);
float8 __ovld __cnfn native_powr(float8 x, float8 y);
float16 __ovld __cnfn native_powr(float16 x, float16 y);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_powr(double x, double y);
-double2 __ovld __cnfn native_powr(double2 x, double2 y);
-double3 __ovld __cnfn native_powr(double3 x, double3 y);
-double4 __ovld __cnfn native_powr(double4 x, double4 y);
-double8 __ovld __cnfn native_powr(double8 x, double8 y);
-double16 __ovld __cnfn native_powr(double16 x, double16 y);
-#endif //cl_khr_fp64
/**
* Compute reciprocal over an implementation-defined
@@ -9986,14 +9915,6 @@ float3 __ovld __cnfn native_recip(float3 x);
float4 __ovld __cnfn native_recip(float4 x);
float8 __ovld __cnfn native_recip(float8 x);
float16 __ovld __cnfn native_recip(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_recip(double x);
-double2 __ovld __cnfn native_recip(double2 x);
-double3 __ovld __cnfn native_recip(double3 x);
-double4 __ovld __cnfn native_recip(double4 x);
-double8 __ovld __cnfn native_recip(double8 x);
-double16 __ovld __cnfn native_recip(double16 x);
-#endif //cl_khr_fp64
/**
* Compute inverse square root over an implementationdefined
@@ -10005,14 +9926,6 @@ float3 __ovld __cnfn native_rsqrt(float3 x);
float4 __ovld __cnfn native_rsqrt(float4 x);
float8 __ovld __cnfn native_rsqrt(float8 x);
float16 __ovld __cnfn native_rsqrt(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_rsqrt(double x);
-double2 __ovld __cnfn native_rsqrt(double2 x);
-double3 __ovld __cnfn native_rsqrt(double3 x);
-double4 __ovld __cnfn native_rsqrt(double4 x);
-double8 __ovld __cnfn native_rsqrt(double8 x);
-double16 __ovld __cnfn native_rsqrt(double16 x);
-#endif //cl_khr_fp64
/**
* Compute sine over an implementation-defined range.
@@ -10024,14 +9937,6 @@ float3 __ovld __cnfn native_sin(float3 x);
float4 __ovld __cnfn native_sin(float4 x);
float8 __ovld __cnfn native_sin(float8 x);
float16 __ovld __cnfn native_sin(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_sin(double x);
-double2 __ovld __cnfn native_sin(double2 x);
-double3 __ovld __cnfn native_sin(double3 x);
-double4 __ovld __cnfn native_sin(double4 x);
-double8 __ovld __cnfn native_sin(double8 x);
-double16 __ovld __cnfn native_sin(double16 x);
-#endif //cl_khr_fp64
/**
* Compute square root over an implementation-defined
@@ -10043,14 +9948,6 @@ float3 __ovld __cnfn native_sqrt(float3 x);
float4 __ovld __cnfn native_sqrt(float4 x);
float8 __ovld __cnfn native_sqrt(float8 x);
float16 __ovld __cnfn native_sqrt(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_sqrt(double x);
-double2 __ovld __cnfn native_sqrt(double2 x);
-double3 __ovld __cnfn native_sqrt(double3 x);
-double4 __ovld __cnfn native_sqrt(double4 x);
-double8 __ovld __cnfn native_sqrt(double8 x);
-double16 __ovld __cnfn native_sqrt(double16 x);
-#endif //cl_khr_fp64
/**
* Compute tangent over an implementation-defined range.
@@ -10062,14 +9959,6 @@ float3 __ovld __cnfn native_tan(float3 x);
float4 __ovld __cnfn native_tan(float4 x);
float8 __ovld __cnfn native_tan(float8 x);
float16 __ovld __cnfn native_tan(float16 x);
-#ifdef cl_khr_fp64
-double __ovld __cnfn native_tan(double x);
-double2 __ovld __cnfn native_tan(double2 x);
-double3 __ovld __cnfn native_tan(double3 x);
-double4 __ovld __cnfn native_tan(double4 x);
-double8 __ovld __cnfn native_tan(double8 x);
-double16 __ovld __cnfn native_tan(double16 x);
-#endif //cl_khr_fp64
// OpenCL v1.1 s6.11.3, v1.2 s6.12.3, v2.0 s6.13.3 - Integer Functions
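
The hunks above drop the double overloads of the native_* built-ins, which the OpenCL spec defines for float types only. A hypothetical device-side sketch of the surviving form (kernel name and signature chosen for illustration, not from the patch):

__kernel void fast_cos(__global const float4 *in, __global float4 *out) {
    size_t i = get_global_id(0);
    out[i] = native_cos(in[i]);     // float-only after this change; use cos() for doubles
}
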
@@ -13934,7 +13823,7 @@ typedef uint cl_mem_fence_flags;
* image objects and then want to read the updated data.
*/
-void __ovld barrier(cl_mem_fence_flags flags);
+void __ovld __conv barrier(cl_mem_fence_flags flags);
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
@@ -13947,8 +13836,8 @@ typedef enum memory_scope
memory_scope_sub_group
} memory_scope;
-void __ovld work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
-void __ovld work_group_barrier(cl_mem_fence_flags flags);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
// OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
@@ -14728,6 +14617,13 @@ int __ovld atom_xor(volatile __local int *p, int val);
unsigned int __ovld atom_xor(volatile __local unsigned int *p, unsigned int val);
#endif
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_xor(volatile __global long *p, long val);
+unsigned long __ovld atom_xor(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_xor(volatile __local long *p, long val);
+unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long val);
+#endif
+
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable
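
A sketch of what the newly declared 64-bit atom_xor overloads permit, assuming a device that reports cl_khr_int64_extended_atomics (kernel is illustrative, not from the patch):

#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable

__kernel void toggle_bit(volatile __global ulong *mask) {
    atom_xor(mask, (ulong)1 << (get_global_id(0) & 63));   // flip one bit per work-item
}
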
@@ -15564,9 +15460,11 @@ half16 __ovld __cnfn shuffle2(half8 x, half8 y, ushort16 mask);
half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask);
#endif //cl_khr_fp16
+#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2
// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
int printf(__constant const char* st, ...);
+#endif
// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions
@@ -15592,6 +15490,10 @@ int printf(__constant const char* st, ...);
#define CLK_FILTER_NEAREST 0x10
#define CLK_FILTER_LINEAR 0x20
+#ifdef cl_khr_gl_msaa_sharing
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
+#endif //cl_khr_gl_msaa_sharing
+
/**
* Use the coordinate (coord.xy) to do an element lookup in
* the 2D image object specified by image.
@@ -16493,6 +16395,7 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_dept
#define CLK_sRGBA 0x10C1
#define CLK_sRGBx 0x10C0
#define CLK_sBGRA 0x10C2
+#define CLK_ABGR 0x10C3
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
int __ovld __cnfn get_image_channel_order(read_only image1d_t image);
@@ -16670,101 +16573,101 @@ int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image);
// OpenCL v2.0 s6.13.15 - Work-group Functions
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
-int __ovld work_group_all(int predicate);
-int __ovld work_group_any(int predicate);
+int __ovld __conv work_group_all(int predicate);
+int __ovld __conv work_group_any(int predicate);
#ifdef cl_khr_fp16
-half __ovld work_group_broadcast(half a, size_t local_id);
-half __ovld work_group_broadcast(half a, size_t x, size_t y);
-half __ovld work_group_broadcast(half a, size_t x, size_t y, size_t z);
+half __ovld __conv work_group_broadcast(half a, size_t local_id);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y, size_t z);
#endif
-int __ovld work_group_broadcast(int a, size_t local_id);
-int __ovld work_group_broadcast(int a, size_t x, size_t y);
-int __ovld work_group_broadcast(int a, size_t x, size_t y, size_t z);
-uint __ovld work_group_broadcast(uint a, size_t local_id);
-uint __ovld work_group_broadcast(uint a, size_t x, size_t y);
-uint __ovld work_group_broadcast(uint a, size_t x, size_t y, size_t z);
-long __ovld work_group_broadcast(long a, size_t local_id);
-long __ovld work_group_broadcast(long a, size_t x, size_t y);
-long __ovld work_group_broadcast(long a, size_t x, size_t y, size_t z);
-ulong __ovld work_group_broadcast(ulong a, size_t local_id);
-ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y);
-ulong __ovld work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
-float __ovld work_group_broadcast(float a, size_t local_id);
-float __ovld work_group_broadcast(float a, size_t x, size_t y);
-float __ovld work_group_broadcast(float a, size_t x, size_t y, size_t z);
+int __ovld __conv work_group_broadcast(int a, size_t local_id);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y, size_t z);
+uint __ovld __conv work_group_broadcast(uint a, size_t local_id);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y, size_t z);
+long __ovld __conv work_group_broadcast(long a, size_t local_id);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y, size_t z);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t local_id);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
+float __ovld __conv work_group_broadcast(float a, size_t local_id);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y, size_t z);
#ifdef cl_khr_fp64
-double __ovld work_group_broadcast(double a, size_t local_id);
-double __ovld work_group_broadcast(double a, size_t x, size_t y);
-double __ovld work_group_broadcast(double a, size_t x, size_t y, size_t z);
+double __ovld __conv work_group_broadcast(double a, size_t local_id);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y, size_t z);
#endif //cl_khr_fp64
#ifdef cl_khr_fp16
-half __ovld work_group_reduce_add(half x);
-half __ovld work_group_reduce_min(half x);
-half __ovld work_group_reduce_max(half x);
-half __ovld work_group_scan_exclusive_add(half x);
-half __ovld work_group_scan_exclusive_min(half x);
-half __ovld work_group_scan_exclusive_max(half x);
-half __ovld work_group_scan_inclusive_add(half x);
-half __ovld work_group_scan_inclusive_min(half x);
-half __ovld work_group_scan_inclusive_max(half x);
+half __ovld __conv work_group_reduce_add(half x);
+half __ovld __conv work_group_reduce_min(half x);
+half __ovld __conv work_group_reduce_max(half x);
+half __ovld __conv work_group_scan_exclusive_add(half x);
+half __ovld __conv work_group_scan_exclusive_min(half x);
+half __ovld __conv work_group_scan_exclusive_max(half x);
+half __ovld __conv work_group_scan_inclusive_add(half x);
+half __ovld __conv work_group_scan_inclusive_min(half x);
+half __ovld __conv work_group_scan_inclusive_max(half x);
#endif
-int __ovld work_group_reduce_add(int x);
-int __ovld work_group_reduce_min(int x);
-int __ovld work_group_reduce_max(int x);
-int __ovld work_group_scan_exclusive_add(int x);
-int __ovld work_group_scan_exclusive_min(int x);
-int __ovld work_group_scan_exclusive_max(int x);
-int __ovld work_group_scan_inclusive_add(int x);
-int __ovld work_group_scan_inclusive_min(int x);
-int __ovld work_group_scan_inclusive_max(int x);
-uint __ovld work_group_reduce_add(uint x);
-uint __ovld work_group_reduce_min(uint x);
-uint __ovld work_group_reduce_max(uint x);
-uint __ovld work_group_scan_exclusive_add(uint x);
-uint __ovld work_group_scan_exclusive_min(uint x);
-uint __ovld work_group_scan_exclusive_max(uint x);
-uint __ovld work_group_scan_inclusive_add(uint x);
-uint __ovld work_group_scan_inclusive_min(uint x);
-uint __ovld work_group_scan_inclusive_max(uint x);
-long __ovld work_group_reduce_add(long x);
-long __ovld work_group_reduce_min(long x);
-long __ovld work_group_reduce_max(long x);
-long __ovld work_group_scan_exclusive_add(long x);
-long __ovld work_group_scan_exclusive_min(long x);
-long __ovld work_group_scan_exclusive_max(long x);
-long __ovld work_group_scan_inclusive_add(long x);
-long __ovld work_group_scan_inclusive_min(long x);
-long __ovld work_group_scan_inclusive_max(long x);
-ulong __ovld work_group_reduce_add(ulong x);
-ulong __ovld work_group_reduce_min(ulong x);
-ulong __ovld work_group_reduce_max(ulong x);
-ulong __ovld work_group_scan_exclusive_add(ulong x);
-ulong __ovld work_group_scan_exclusive_min(ulong x);
-ulong __ovld work_group_scan_exclusive_max(ulong x);
-ulong __ovld work_group_scan_inclusive_add(ulong x);
-ulong __ovld work_group_scan_inclusive_min(ulong x);
-ulong __ovld work_group_scan_inclusive_max(ulong x);
-float __ovld work_group_reduce_add(float x);
-float __ovld work_group_reduce_min(float x);
-float __ovld work_group_reduce_max(float x);
-float __ovld work_group_scan_exclusive_add(float x);
-float __ovld work_group_scan_exclusive_min(float x);
-float __ovld work_group_scan_exclusive_max(float x);
-float __ovld work_group_scan_inclusive_add(float x);
-float __ovld work_group_scan_inclusive_min(float x);
-float __ovld work_group_scan_inclusive_max(float x);
+int __ovld __conv work_group_reduce_add(int x);
+int __ovld __conv work_group_reduce_min(int x);
+int __ovld __conv work_group_reduce_max(int x);
+int __ovld __conv work_group_scan_exclusive_add(int x);
+int __ovld __conv work_group_scan_exclusive_min(int x);
+int __ovld __conv work_group_scan_exclusive_max(int x);
+int __ovld __conv work_group_scan_inclusive_add(int x);
+int __ovld __conv work_group_scan_inclusive_min(int x);
+int __ovld __conv work_group_scan_inclusive_max(int x);
+uint __ovld __conv work_group_reduce_add(uint x);
+uint __ovld __conv work_group_reduce_min(uint x);
+uint __ovld __conv work_group_reduce_max(uint x);
+uint __ovld __conv work_group_scan_exclusive_add(uint x);
+uint __ovld __conv work_group_scan_exclusive_min(uint x);
+uint __ovld __conv work_group_scan_exclusive_max(uint x);
+uint __ovld __conv work_group_scan_inclusive_add(uint x);
+uint __ovld __conv work_group_scan_inclusive_min(uint x);
+uint __ovld __conv work_group_scan_inclusive_max(uint x);
+long __ovld __conv work_group_reduce_add(long x);
+long __ovld __conv work_group_reduce_min(long x);
+long __ovld __conv work_group_reduce_max(long x);
+long __ovld __conv work_group_scan_exclusive_add(long x);
+long __ovld __conv work_group_scan_exclusive_min(long x);
+long __ovld __conv work_group_scan_exclusive_max(long x);
+long __ovld __conv work_group_scan_inclusive_add(long x);
+long __ovld __conv work_group_scan_inclusive_min(long x);
+long __ovld __conv work_group_scan_inclusive_max(long x);
+ulong __ovld __conv work_group_reduce_add(ulong x);
+ulong __ovld __conv work_group_reduce_min(ulong x);
+ulong __ovld __conv work_group_reduce_max(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_max(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_max(ulong x);
+float __ovld __conv work_group_reduce_add(float x);
+float __ovld __conv work_group_reduce_min(float x);
+float __ovld __conv work_group_reduce_max(float x);
+float __ovld __conv work_group_scan_exclusive_add(float x);
+float __ovld __conv work_group_scan_exclusive_min(float x);
+float __ovld __conv work_group_scan_exclusive_max(float x);
+float __ovld __conv work_group_scan_inclusive_add(float x);
+float __ovld __conv work_group_scan_inclusive_min(float x);
+float __ovld __conv work_group_scan_inclusive_max(float x);
#ifdef cl_khr_fp64
-double __ovld work_group_reduce_add(double x);
-double __ovld work_group_reduce_min(double x);
-double __ovld work_group_reduce_max(double x);
-double __ovld work_group_scan_exclusive_add(double x);
-double __ovld work_group_scan_exclusive_min(double x);
-double __ovld work_group_scan_exclusive_max(double x);
-double __ovld work_group_scan_inclusive_add(double x);
-double __ovld work_group_scan_inclusive_min(double x);
-double __ovld work_group_scan_inclusive_max(double x);
+double __ovld __conv work_group_reduce_add(double x);
+double __ovld __conv work_group_reduce_min(double x);
+double __ovld __conv work_group_reduce_max(double x);
+double __ovld __conv work_group_scan_exclusive_add(double x);
+double __ovld __conv work_group_scan_exclusive_min(double x);
+double __ovld __conv work_group_scan_exclusive_max(double x);
+double __ovld __conv work_group_scan_inclusive_add(double x);
+double __ovld __conv work_group_scan_inclusive_min(double x);
+double __ovld __conv work_group_scan_inclusive_max(double x);
#endif //cl_khr_fp64
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
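
The work-group collectives above gain __conv (convergent), which keeps the optimizer from sinking them into divergent control flow, since every work-item in the group has to reach the call. A minimal OpenCL 2.0 sketch, illustrative only (compile with -cl-std=CL2.0):

__kernel void group_sum(__global const float *in, __global float *out) {
    float s = work_group_reduce_add(in[get_global_id(0)]);  // all work-items reach this together
    if (get_local_id(0) == 0)
        out[get_group_id(0)] = s;                           // one result per work-group
}
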
@@ -16840,11 +16743,11 @@ void __ovld retain_event(clk_event_t);
void __ovld release_event(clk_event_t);
-clk_event_t create_user_event(void);
+clk_event_t __ovld create_user_event(void);
void __ovld set_user_event_status(clk_event_t e, int state);
-bool is_valid_event (clk_event_t event);
+bool __ovld is_valid_event (clk_event_t event);
void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void* value);
@@ -16864,96 +16767,286 @@ uint __ovld get_enqueued_num_sub_groups(void);
uint __ovld get_sub_group_id(void);
uint __ovld get_sub_group_local_id(void);
-void __ovld sub_group_barrier(cl_mem_fence_flags flags);
+void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags);
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
-void __ovld sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
-int __ovld sub_group_all(int predicate);
-int __ovld sub_group_any(int predicate);
-
-int __ovld sub_group_broadcast(int x, uint sub_group_local_id);
-uint __ovld sub_group_broadcast(uint x, uint sub_group_local_id);
-long __ovld sub_group_broadcast(long x, uint sub_group_local_id);
-ulong __ovld sub_group_broadcast(ulong x, uint sub_group_local_id);
-float __ovld sub_group_broadcast(float x, uint sub_group_local_id);
-
-int __ovld sub_group_reduce_add(int x);
-uint __ovld sub_group_reduce_add(uint x);
-long __ovld sub_group_reduce_add(long x);
-ulong __ovld sub_group_reduce_add(ulong x);
-float __ovld sub_group_reduce_add(float x);
-int __ovld sub_group_reduce_min(int x);
-uint __ovld sub_group_reduce_min(uint x);
-long __ovld sub_group_reduce_min(long x);
-ulong __ovld sub_group_reduce_min(ulong x);
-float __ovld sub_group_reduce_min(float x);
-int __ovld sub_group_reduce_max(int x);
-uint __ovld sub_group_reduce_max(uint x);
-long __ovld sub_group_reduce_max(long x);
-ulong __ovld sub_group_reduce_max(ulong x);
-float __ovld sub_group_reduce_max(float x);
-
-int __ovld sub_group_scan_exclusive_add(int x);
-uint __ovld sub_group_scan_exclusive_add(uint x);
-long __ovld sub_group_scan_exclusive_add(long x);
-ulong __ovld sub_group_scan_exclusive_add(ulong x);
-float __ovld sub_group_scan_exclusive_add(float x);
-int __ovld sub_group_scan_exclusive_min(int x);
-uint __ovld sub_group_scan_exclusive_min(uint x);
-long __ovld sub_group_scan_exclusive_min(long x);
-ulong __ovld sub_group_scan_exclusive_min(ulong x);
-float __ovld sub_group_scan_exclusive_min(float x);
-int __ovld sub_group_scan_exclusive_max(int x);
-uint __ovld sub_group_scan_exclusive_max(uint x);
-long __ovld sub_group_scan_exclusive_max(long x);
-ulong __ovld sub_group_scan_exclusive_max(ulong x);
-float __ovld sub_group_scan_exclusive_max(float x);
-
-int __ovld sub_group_scan_inclusive_add(int x);
-uint __ovld sub_group_scan_inclusive_add(uint x);
-long __ovld sub_group_scan_inclusive_add(long x);
-ulong __ovld sub_group_scan_inclusive_add(ulong x);
-float __ovld sub_group_scan_inclusive_add(float x);
-int __ovld sub_group_scan_inclusive_min(int x);
-uint __ovld sub_group_scan_inclusive_min(uint x);
-long __ovld sub_group_scan_inclusive_min(long x);
-ulong __ovld sub_group_scan_inclusive_min(ulong x);
-float __ovld sub_group_scan_inclusive_min(float x);
-int __ovld sub_group_scan_inclusive_max(int x);
-uint __ovld sub_group_scan_inclusive_max(uint x);
-long __ovld sub_group_scan_inclusive_max(long x);
-ulong __ovld sub_group_scan_inclusive_max(ulong x);
-float __ovld sub_group_scan_inclusive_max(float x);
+int __ovld __conv sub_group_all(int predicate);
+int __ovld __conv sub_group_any(int predicate);
+
+int __ovld __conv sub_group_broadcast(int x, uint sub_group_local_id);
+uint __ovld __conv sub_group_broadcast(uint x, uint sub_group_local_id);
+long __ovld __conv sub_group_broadcast(long x, uint sub_group_local_id);
+ulong __ovld __conv sub_group_broadcast(ulong x, uint sub_group_local_id);
+float __ovld __conv sub_group_broadcast(float x, uint sub_group_local_id);
+
+int __ovld __conv sub_group_reduce_add(int x);
+uint __ovld __conv sub_group_reduce_add(uint x);
+long __ovld __conv sub_group_reduce_add(long x);
+ulong __ovld __conv sub_group_reduce_add(ulong x);
+float __ovld __conv sub_group_reduce_add(float x);
+int __ovld __conv sub_group_reduce_min(int x);
+uint __ovld __conv sub_group_reduce_min(uint x);
+long __ovld __conv sub_group_reduce_min(long x);
+ulong __ovld __conv sub_group_reduce_min(ulong x);
+float __ovld __conv sub_group_reduce_min(float x);
+int __ovld __conv sub_group_reduce_max(int x);
+uint __ovld __conv sub_group_reduce_max(uint x);
+long __ovld __conv sub_group_reduce_max(long x);
+ulong __ovld __conv sub_group_reduce_max(ulong x);
+float __ovld __conv sub_group_reduce_max(float x);
+
+int __ovld __conv sub_group_scan_exclusive_add(int x);
+uint __ovld __conv sub_group_scan_exclusive_add(uint x);
+long __ovld __conv sub_group_scan_exclusive_add(long x);
+ulong __ovld __conv sub_group_scan_exclusive_add(ulong x);
+float __ovld __conv sub_group_scan_exclusive_add(float x);
+int __ovld __conv sub_group_scan_exclusive_min(int x);
+uint __ovld __conv sub_group_scan_exclusive_min(uint x);
+long __ovld __conv sub_group_scan_exclusive_min(long x);
+ulong __ovld __conv sub_group_scan_exclusive_min(ulong x);
+float __ovld __conv sub_group_scan_exclusive_min(float x);
+int __ovld __conv sub_group_scan_exclusive_max(int x);
+uint __ovld __conv sub_group_scan_exclusive_max(uint x);
+long __ovld __conv sub_group_scan_exclusive_max(long x);
+ulong __ovld __conv sub_group_scan_exclusive_max(ulong x);
+float __ovld __conv sub_group_scan_exclusive_max(float x);
+
+int __ovld __conv sub_group_scan_inclusive_add(int x);
+uint __ovld __conv sub_group_scan_inclusive_add(uint x);
+long __ovld __conv sub_group_scan_inclusive_add(long x);
+ulong __ovld __conv sub_group_scan_inclusive_add(ulong x);
+float __ovld __conv sub_group_scan_inclusive_add(float x);
+int __ovld __conv sub_group_scan_inclusive_min(int x);
+uint __ovld __conv sub_group_scan_inclusive_min(uint x);
+long __ovld __conv sub_group_scan_inclusive_min(long x);
+ulong __ovld __conv sub_group_scan_inclusive_min(ulong x);
+float __ovld __conv sub_group_scan_inclusive_min(float x);
+int __ovld __conv sub_group_scan_inclusive_max(int x);
+uint __ovld __conv sub_group_scan_inclusive_max(uint x);
+long __ovld __conv sub_group_scan_inclusive_max(long x);
+ulong __ovld __conv sub_group_scan_inclusive_max(ulong x);
+float __ovld __conv sub_group_scan_inclusive_max(float x);
#ifdef cl_khr_fp16
-half __ovld sub_group_broadcast(half x, uint sub_group_local_id);
-half __ovld sub_group_reduce_add(half x);
-half __ovld sub_group_reduce_min(half x);
-half __ovld sub_group_reduce_max(half x);
-half __ovld sub_group_scan_exclusive_add(half x);
-half __ovld sub_group_scan_exclusive_min(half x);
-half __ovld sub_group_scan_exclusive_max(half x);
-half __ovld sub_group_scan_inclusive_add(half x);
-half __ovld sub_group_scan_inclusive_min(half x);
-half __ovld sub_group_scan_inclusive_max(half x);
+half __ovld __conv sub_group_broadcast(half x, uint sub_group_local_id);
+half __ovld __conv sub_group_reduce_add(half x);
+half __ovld __conv sub_group_reduce_min(half x);
+half __ovld __conv sub_group_reduce_max(half x);
+half __ovld __conv sub_group_scan_exclusive_add(half x);
+half __ovld __conv sub_group_scan_exclusive_min(half x);
+half __ovld __conv sub_group_scan_exclusive_max(half x);
+half __ovld __conv sub_group_scan_inclusive_add(half x);
+half __ovld __conv sub_group_scan_inclusive_min(half x);
+half __ovld __conv sub_group_scan_inclusive_max(half x);
#endif //cl_khr_fp16
#ifdef cl_khr_fp64
-double __ovld sub_group_broadcast(double x, uint sub_group_local_id);
-double __ovld sub_group_reduce_add(double x);
-double __ovld sub_group_reduce_min(double x);
-double __ovld sub_group_reduce_max(double x);
-double __ovld sub_group_scan_exclusive_add(double x);
-double __ovld sub_group_scan_exclusive_min(double x);
-double __ovld sub_group_scan_exclusive_max(double x);
-double __ovld sub_group_scan_inclusive_add(double x);
-double __ovld sub_group_scan_inclusive_min(double x);
-double __ovld sub_group_scan_inclusive_max(double x);
+double __ovld __conv sub_group_broadcast(double x, uint sub_group_local_id);
+double __ovld __conv sub_group_reduce_add(double x);
+double __ovld __conv sub_group_reduce_min(double x);
+double __ovld __conv sub_group_reduce_max(double x);
+double __ovld __conv sub_group_scan_exclusive_add(double x);
+double __ovld __conv sub_group_scan_exclusive_min(double x);
+double __ovld __conv sub_group_scan_exclusive_max(double x);
+double __ovld __conv sub_group_scan_inclusive_add(double x);
+double __ovld __conv sub_group_scan_inclusive_min(double x);
+double __ovld __conv sub_group_scan_inclusive_max(double x);
#endif //cl_khr_fp64
#endif //cl_khr_subgroups cl_intel_subgroups
+#ifdef cl_amd_media_ops
+uint __ovld amd_bitalign(uint a, uint b, uint c);
+uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bitalign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bitalign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bitalign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bitalign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_bytealign(uint a, uint b, uint c);
+uint2 __ovld amd_bytealign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bytealign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bytealign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bytealign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bytealign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_lerp(uint a, uint b, uint c);
+uint2 __ovld amd_lerp(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_lerp(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_lerp(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_lerp(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_lerp(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_pack(float4 v);
+
+uint __ovld amd_sad4(uint4 x, uint4 y, uint z);
+
+uint __ovld amd_sadhi(uint a, uint b, uint c);
+uint2 __ovld amd_sadhi(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sadhi(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sadhi(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sadhi(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sadhi(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_sad(uint a, uint b, uint c);
+uint2 __ovld amd_sad(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sad(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sad(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sad(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sad(uint16 a, uint16 b, uint16 c);
+
+float __ovld amd_unpack0(uint a);
+float2 __ovld amd_unpack0(uint2 a);
+float3 __ovld amd_unpack0(uint3 a);
+float4 __ovld amd_unpack0(uint4 a);
+float8 __ovld amd_unpack0(uint8 a);
+float16 __ovld amd_unpack0(uint16 a);
+
+float __ovld amd_unpack1(uint a);
+float2 __ovld amd_unpack1(uint2 a);
+float3 __ovld amd_unpack1(uint3 a);
+float4 __ovld amd_unpack1(uint4 a);
+float8 __ovld amd_unpack1(uint8 a);
+float16 __ovld amd_unpack1(uint16 a);
+
+float __ovld amd_unpack2(uint a);
+float2 __ovld amd_unpack2(uint2 a);
+float3 __ovld amd_unpack2(uint3 a);
+float4 __ovld amd_unpack2(uint4 a);
+float8 __ovld amd_unpack2(uint8 a);
+float16 __ovld amd_unpack2(uint16 a);
+
+float __ovld amd_unpack3(uint a);
+float2 __ovld amd_unpack3(uint2 a);
+float3 __ovld amd_unpack3(uint3 a);
+float4 __ovld amd_unpack3(uint4 a);
+float8 __ovld amd_unpack3(uint8 a);
+float16 __ovld amd_unpack3(uint16 a);
+#endif // cl_amd_media_ops
+
+#ifdef cl_amd_media_ops2
+int __ovld amd_bfe(int src0, uint src1, uint src2);
+int2 __ovld amd_bfe(int2 src0, uint2 src1, uint2 src2);
+int3 __ovld amd_bfe(int3 src0, uint3 src1, uint3 src2);
+int4 __ovld amd_bfe(int4 src0, uint4 src1, uint4 src2);
+int8 __ovld amd_bfe(int8 src0, uint8 src1, uint8 src2);
+int16 __ovld amd_bfe(int16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfe(uint src0, uint src1, uint src2);
+uint2 __ovld amd_bfe(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_bfe(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_bfe(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_bfe(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_bfe(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfm(uint src0, uint src1);
+uint2 __ovld amd_bfm(uint2 src0, uint2 src1);
+uint3 __ovld amd_bfm(uint3 src0, uint3 src1);
+uint4 __ovld amd_bfm(uint4 src0, uint4 src1);
+uint8 __ovld amd_bfm(uint8 src0, uint8 src1);
+uint16 __ovld amd_bfm(uint16 src0, uint16 src1);
+
+float __ovld amd_max3(float src0, float src1, float src2);
+float2 __ovld amd_max3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_max3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_max3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_max3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_max3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_max3(int src0, int src1, int src2);
+int2 __ovld amd_max3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_max3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_max3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_max3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_max3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_max3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_max3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_max3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_max3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_max3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_max3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_median3(float src0, float src1, float src2);
+float2 __ovld amd_median3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_median3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_median3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_median3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_median3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_median3(int src0, int src1, int src2);
+int2 __ovld amd_median3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_median3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_median3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_median3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_median3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_median3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_median3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_median3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_median3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_median3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_median3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_min3(float src0, float src1, float src);
+float2 __ovld amd_min3(float2 src0, float2 src1, float2 src);
+float3 __ovld amd_min3(float3 src0, float3 src1, float3 src);
+float4 __ovld amd_min3(float4 src0, float4 src1, float4 src);
+float8 __ovld amd_min3(float8 src0, float8 src1, float8 src);
+float16 __ovld amd_min3(float16 src0, float16 src1, float16 src);
+
+int __ovld amd_min3(int src0, int src1, int src2);
+int2 __ovld amd_min3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_min3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_min3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_min3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_min3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_min3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_min3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_min3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_min3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_min3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_min3(uint16 src0, uint16 src1, uint16 src2);
+
+ulong __ovld amd_mqsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_mqsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_mqsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_mqsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_mqsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_mqsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+ulong __ovld amd_qsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_qsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_qsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_qsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_qsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_qsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+uint __ovld amd_msad(uint src0, uint src1, uint src2);
+uint2 __ovld amd_msad(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_msad(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_msad(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_msad(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_msad(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadd(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadd(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadd(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadd(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadd(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadd(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadw(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadw(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadw(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadw(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadw(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadw(uint16 src0, uint16 src1, uint16 src2);
+#endif // cl_amd_media_ops2
+
// Disable any extensions we may have enabled previously.
#pragma OPENCL EXTENSION all : disable
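
The AMD media-ops declarations above are only usable when the device exposes the corresponding extension; a hypothetical kernel using one of them (amd_median3 returns the middle of its three inputs):

#ifdef cl_amd_media_ops2
__kernel void median_filter_step(__global const uint *a, __global const uint *b,
                                 __global const uint *c, __global uint *out) {
    size_t i = get_global_id(0);
    out[i] = amd_median3(a[i], b[i], c[i]);
}
#endif // cl_amd_media_ops2
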
diff --git a/lib/Headers/pmmintrin.h b/lib/Headers/pmmintrin.h
index 5b1058069c44..d4f6487af179 100644
--- a/lib/Headers/pmmintrin.h
+++ b/lib/Headers/pmmintrin.h
@@ -37,7 +37,7 @@
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VLDDQU instruction.
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit integer vector containing integer values.
@@ -53,7 +53,7 @@ _mm_lddqu_si128(__m128i const *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDSUBPS instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the left source operand.
@@ -72,7 +72,7 @@ _mm_addsub_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHADDPS instruction.
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -95,7 +95,7 @@ _mm_hadd_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHSUBPS instruction.
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -115,18 +115,18 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
/// vector of [4 x float] to float values stored in a 128-bit vector of
-/// [4 x float].
-/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
-/// the destination.
-/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
-/// destination.
+/// [4 x float].
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSHDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
///
/// \param __a
-/// A 128-bit vector of [4 x float].
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
+/// the destination. \n
+/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
+/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -135,20 +135,19 @@ _mm_movehdup_ps(__m128 __a)
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}
-/// \brief Duplicates low-order (even-indexed) values from a 128-bit
-/// vector of [4 x float] to float values stored in a 128-bit vector of
-/// [4 x float].
-/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
-/// the destination.
-/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
-/// destination.
+/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
+/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSLDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
///
/// \param __a
-/// A 128-bit vector of [4 x float].
+/// A 128-bit vector of [4 x float] \n
+/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
+/// the destination. \n
+/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
+/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -162,7 +161,7 @@ _mm_moveldup_ps(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDSUBPD instruction.
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the left source operand.
@@ -181,7 +180,7 @@ _mm_addsub_pd(__m128d __a, __m128d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHADDPD instruction.
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
@@ -204,7 +203,7 @@ _mm_hadd_pd(__m128d __a, __m128d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VHSUBPD instruction.
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
@@ -231,7 +230,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b)
/// __m128d _mm_loaddup_pd(double const * dp);
/// \endcode
///
-/// This intrinsic corresponds to the \c VMOVDDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
/// A pointer to a double-precision value to be moved and duplicated.
@@ -245,7 +244,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVDDUP instruction.
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
@@ -272,7 +271,7 @@ _mm_movedup_pd(__m128d __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MONITOR instruction.
+/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
///
/// \param __p
/// The memory range to be monitored. The size of the range is determined by
@@ -293,7 +292,7 @@ _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MWAIT instruction.
+/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
///
/// \param __extensions
/// Optional extensions for the monitoring state, which may vary by
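To make the add/subtract and horizontal-add semantics documented in this header concrete, here is a minimal complex-multiplication sketch built on the SSE3 intrinsics above (an illustrative example, assuming -msse3 and the usual emmintrin.h helpers):

#include <pmmintrin.h>   /* SSE3: _mm_loaddup_pd, _mm_addsub_pd, _mm_hadd_pd */
#include <stdio.h>

/* (a + bi) * (c + di) = (a*c - b*d) + (a*d + b*c)i */
static void cmul(const double x[2], const double y[2], double out[2]) {
    __m128d xv = _mm_loadu_pd(x);                            /* {a, b}     */
    __m128d cc = _mm_loaddup_pd(&y[0]);                      /* {c, c}     */
    __m128d dd = _mm_loaddup_pd(&y[1]);                      /* {d, d}     */
    __m128d t1 = _mm_mul_pd(xv, cc);                         /* {a*c, b*c} */
    __m128d t2 = _mm_mul_pd(_mm_shuffle_pd(xv, xv, 1), dd);  /* {b*d, a*d} */
    /* addsub subtracts in lane 0 and adds in lane 1: {a*c - b*d, b*c + a*d} */
    _mm_storeu_pd(out, _mm_addsub_pd(t1, t2));
}

int main(void) {
    double x[2] = {1.0, 2.0}, y[2] = {3.0, 4.0}, r[2];
    cmul(x, y, r);                               /* (1+2i)(3+4i) = -5 + 10i */
    printf("%g + %gi\n", r[0], r[1]);

    /* _mm_hadd_pd yields the horizontal sum of a [2 x double] vector. */
    __m128d v = _mm_setr_pd(1.5, 2.5);
    printf("hadd: %g\n", _mm_cvtsd_f64(_mm_hadd_pd(v, v))); /* 4 */
    return 0;
}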
diff --git a/lib/Headers/popcntintrin.h b/lib/Headers/popcntintrin.h
index 7e2f1670805f..0b4793e58bcb 100644
--- a/lib/Headers/popcntintrin.h
+++ b/lib/Headers/popcntintrin.h
@@ -31,7 +31,7 @@
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// An unsigned 32-bit integer operand.
@@ -47,7 +47,7 @@ _mm_popcnt_u32(unsigned int __A)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// A signed 32-bit integer operand.
@@ -64,7 +64,7 @@ _popcnt32(int __A)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// An unsigned 64-bit integer operand.
@@ -80,7 +80,7 @@ _mm_popcnt_u64(unsigned long long __A)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c POPCNT instruction.
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// A signed 64-bit integer operand.
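A small usage sketch of the POPCNT intrinsics whose documentation is updated above (illustrative only; assumes a POPCNT-capable 64-bit x86 target, e.g. compiled with -mpopcnt):

#include <popcntintrin.h>
#include <stdio.h>

int main(void) {
    unsigned int x = 0xF0F0u;              /* 8 set bits  */
    unsigned long long y = 0xFFFFFFFFull;  /* 32 set bits */

    printf("_mm_popcnt_u32: %d\n", _mm_popcnt_u32(x));
    printf("_mm_popcnt_u64: %lld\n", _mm_popcnt_u64(y));
    printf("_popcnt32(-1):  %d\n", _popcnt32(-1));  /* all 32 bits set */
    return 0;
}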
diff --git a/lib/Headers/stdatomic.h b/lib/Headers/stdatomic.h
index e03798766014..23bb3a357768 100644
--- a/lib/Headers/stdatomic.h
+++ b/lib/Headers/stdatomic.h
@@ -45,11 +45,11 @@ extern "C" {
#define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE
#define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE
#define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE
-#define ATOMIC_SHORT_T_LOCK_FREE __GCC_ATOMIC_SHORT_T_LOCK_FREE
-#define ATOMIC_INT_T_LOCK_FREE __GCC_ATOMIC_INT_T_LOCK_FREE
-#define ATOMIC_LONG_T_LOCK_FREE __GCC_ATOMIC_LONG_T_LOCK_FREE
-#define ATOMIC_LLONG_T_LOCK_FREE __GCC_ATOMIC_LLONG_T_LOCK_FREE
-#define ATOMIC_POINTER_T_LOCK_FREE __GCC_ATOMIC_POINTER_T_LOCK_FREE
+#define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE
+#define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE
+#define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE
+#define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE
+#define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE
/* 7.17.2 Initialization */
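The renamed macros now match the C11 spellings; a minimal C11 sketch of how they are typically consulted (illustrative only):

#include <stdatomic.h>
#include <stdio.h>

int main(void) {
    /* 0 = never lock-free, 1 = sometimes, 2 = always (C11 semantics). */
    printf("int: %d  long: %d  pointer: %d\n",
           (int)ATOMIC_INT_LOCK_FREE,
           (int)ATOMIC_LONG_LOCK_FREE,
           (int)ATOMIC_POINTER_LOCK_FREE);

    atomic_int counter = 0;
    atomic_fetch_add(&counter, 1);
    printf("counter = %d\n", atomic_load(&counter));
    return 0;
}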
diff --git a/lib/Headers/tmmintrin.h b/lib/Headers/tmmintrin.h
index a72796ba4a68..80664043a06f 100644
--- a/lib/Headers/tmmintrin.h
+++ b/lib/Headers/tmmintrin.h
@@ -483,15 +483,15 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
/// \param __b
/// A 128-bit integer vector containing the second source operand.
/// \returns A 128-bit integer vector containing the sums of products of both
-/// operands:
-/// R0 := (__a0 * __b0) + (__a1 * __b1)
-/// R1 := (__a2 * __b2) + (__a3 * __b3)
-/// R2 := (__a4 * __b4) + (__a5 * __b5)
-/// R3 := (__a6 * __b6) + (__a7 * __b7)
-/// R4 := (__a8 * __b8) + (__a9 * __b9)
-/// R5 := (__a10 * __b10) + (__a11 * __b11)
-/// R6 := (__a12 * __b12) + (__a13 * __b13)
-/// R7 := (__a14 * __b14) + (__a15 * __b15)
+/// operands: \n
+/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
+/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
+/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
+/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
+/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maddubs_epi16(__m128i __a, __m128i __b)
{
@@ -516,11 +516,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
/// \param __b
/// A 64-bit integer vector containing the second source operand.
/// \returns A 64-bit integer vector containing the sums of products of both
-/// operands:
-/// R0 := (__a0 * __b0) + (__a1 * __b1)
-/// R1 := (__a2 * __b2) + (__a3 * __b3)
-/// R2 := (__a4 * __b4) + (__a5 * __b5)
-/// R3 := (__a6 * __b6) + (__a7 * __b7)
+/// operands: \n
+/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_maddubs_pi16(__m64 __a, __m64 __b)
{
@@ -580,11 +580,11 @@ _mm_mulhrs_pi16(__m64 __a, __m64 __b)
/// \param __b
/// A 128-bit integer vector containing control bytes corresponding to
/// positions in the destination:
-/// Bit 7:
-/// 1: Clear the corresponding byte in the destination.
+/// Bit 7: \n
+/// 1: Clear the corresponding byte in the destination. \n
/// 0: Copy the selected source byte to the corresponding byte in the
-/// destination.
-/// Bits [6:4] Reserved.
+/// destination. \n
+/// Bits [6:4] Reserved. \n
/// Bits [3:0] select the source byte to be copied.
/// \returns A 128-bit integer vector containing the copied or cleared values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -606,10 +606,10 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
/// \param __b
/// A 64-bit integer vector containing control bytes corresponding to
/// positions in the destination:
-/// Bit 7:
-/// 1: Clear the corresponding byte in the destination.
+/// Bit 7: \n
+/// 1: Clear the corresponding byte in the destination. \n
/// 0: Copy the selected source byte to the corresponding byte in the
-/// destination.
+/// destination. \n
/// Bits [3:0] select the source byte to be copied.
/// \returns A 64-bit integer vector containing the copied or cleared values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
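A short sketch of the PSHUFB control-byte behavior spelled out above (bit 7 of a control byte clears the lane, bits [3:0] select the source byte); illustrative only, assuming an SSSE3 target such as -mssse3:

#include <tmmintrin.h>   /* SSSE3 */
#include <stdio.h>

int main(void) {
    __m128i src = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                8, 9, 10, 11, 12, 13, 14, 15);
    /* Reverse the 16 bytes, except lane 0: its control byte has bit 7 set,
       so that lane is cleared to zero. */
    __m128i ctl = _mm_setr_epi8((char)0x80, 14, 13, 12, 11, 10, 9, 8,
                                7, 6, 5, 4, 3, 2, 1, 0);
    unsigned char out[16];
    _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(src, ctl));

    for (int i = 0; i < 16; ++i)
        printf("%d ", out[i]);   /* 0 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 */
    printf("\n");
    return 0;
}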
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index 99cddb0fac82..dc31b85cfd7c 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -46,7 +46,7 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDSS / ADDSS instructions.
+/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -69,7 +69,7 @@ _mm_add_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VADDPS / ADDPS instructions.
+/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -88,7 +88,7 @@ _mm_add_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions.
+/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
@@ -112,7 +112,7 @@ _mm_sub_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions.
+/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the minuend.
@@ -131,7 +131,7 @@ _mm_sub_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMULSS / MULSS instructions.
+/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -154,7 +154,7 @@ _mm_mul_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMULPS / MULPS instructions.
+/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -173,7 +173,7 @@ _mm_mul_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions.
+/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
@@ -195,7 +195,7 @@ _mm_div_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions.
+/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the dividend.
@@ -214,7 +214,7 @@ _mm_div_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions.
+/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -233,7 +233,7 @@ _mm_sqrt_ss(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions.
+/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -250,7 +250,7 @@ _mm_sqrt_ps(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions.
+/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -269,7 +269,7 @@ _mm_rcp_ss(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions.
+/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -286,7 +286,7 @@ _mm_rcp_ps(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions.
+/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -306,7 +306,7 @@ _mm_rsqrt_ss(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions.
+/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -324,7 +324,7 @@ _mm_rsqrt_ps(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMINSS / MINSS instructions.
+/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -341,12 +341,12 @@ _mm_min_ss(__m128 __a, __m128 __b)
return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
}
-/// \brief Compares two 128-bit vectors of [4 x float] and returns the
-/// lesser of each pair of values.
+/// \brief Compares two 128-bit vectors of [4 x float] and returns the lesser
+/// of each pair of values.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMINPS / MINPS instructions.
+/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands.
@@ -361,12 +361,12 @@ _mm_min_ps(__m128 __a, __m128 __b)
}
/// \brief Compares two 32-bit float values in the low-order bits of both
-/// operands and returns the greater value in the low-order bits of
-/// a vector [4 x float].
+/// operands and returns the greater value in the low-order bits of a 128-bit
+/// vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions.
+/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -388,7 +388,7 @@ _mm_max_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions.
+/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands.
@@ -406,7 +406,7 @@ _mm_max_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDPS / ANDPS instructions.
+/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
///
/// \param __a
/// A 128-bit vector containing one of the source operands.
@@ -426,7 +426,7 @@ _mm_and_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions.
+/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the first source operand. The
@@ -446,7 +446,7 @@ _mm_andnot_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VORPS / ORPS instructions.
+/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -465,7 +465,7 @@ _mm_or_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VXORPS / XORPS instructions.
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
@@ -485,7 +485,7 @@ _mm_xor_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions.
+/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -506,7 +506,7 @@ _mm_cmpeq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions.
+/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -526,7 +526,7 @@ _mm_cmpeq_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -548,7 +548,7 @@ _mm_cmplt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -569,7 +569,7 @@ _mm_cmplt_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -591,7 +591,7 @@ _mm_cmple_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -611,7 +611,7 @@ _mm_cmple_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -635,7 +635,7 @@ _mm_cmpgt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -656,7 +656,7 @@ _mm_cmpgt_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -680,7 +680,7 @@ _mm_cmpge_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -699,7 +699,8 @@ _mm_cmpge_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions.
+/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -720,7 +721,8 @@ _mm_cmpneq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -740,7 +742,8 @@ _mm_cmpneq_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -762,7 +765,8 @@ _mm_cmpnlt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -783,7 +787,8 @@ _mm_cmpnlt_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -805,7 +810,8 @@ _mm_cmpnle_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -826,7 +832,8 @@ _mm_cmpnle_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -850,7 +857,8 @@ _mm_cmpngt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -871,7 +879,8 @@ _mm_cmpngt_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -895,7 +904,8 @@ _mm_cmpnge_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -916,7 +926,8 @@ _mm_cmpnge_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions.
+/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -938,7 +949,8 @@ _mm_cmpord_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions.
+/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -959,7 +971,8 @@ _mm_cmpord_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions.
+/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the operands. The lower
@@ -981,7 +994,8 @@ _mm_cmpunord_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions.
+/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -999,7 +1013,8 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1020,7 +1035,8 @@ _mm_comieq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1041,7 +1057,7 @@ _mm_comilt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1062,7 +1078,7 @@ _mm_comile_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1083,7 +1099,7 @@ _mm_comigt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1104,7 +1120,7 @@ _mm_comige_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1125,7 +1141,7 @@ _mm_comineq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1146,7 +1162,7 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1162,13 +1178,13 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b)
}
/// \brief Performs an unordered comparison of two 32-bit float values using
-/// the low-order bits of both operands to determine if the first operand
-/// is less than or equal to the second operand and returns the result of
-/// the comparison.
+/// the low-order bits of both operands to determine if the first operand is
+/// less than or equal to the second operand and returns the result of the
+/// comparison.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1184,13 +1200,13 @@ _mm_ucomile_ss(__m128 __a, __m128 __b)
}
/// \brief Performs an unordered comparison of two 32-bit float values using
-/// the low-order bits of both operands to determine if the first operand
-/// is greater than the second operand and returns the result of the
+/// the low-order bits of both operands to determine if the first operand is
+/// greater than the second operand and returns the result of the
/// comparison.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1212,7 +1228,7 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1233,7 +1249,7 @@ _mm_ucomige_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1253,7 +1269,8 @@ _mm_ucomineq_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1270,7 +1287,8 @@ _mm_cvtss_si32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1289,7 +1307,8 @@ _mm_cvt_ss2si(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1308,7 +1327,7 @@ _mm_cvtss_si64(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPS2PI instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1324,7 +1343,7 @@ _mm_cvtps_pi32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPS2PI instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1341,7 +1360,8 @@ _mm_cvt_ps2pi(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1359,7 +1379,8 @@ _mm_cvttss_si32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1371,13 +1392,15 @@ _mm_cvtt_ss2si(__m128 __a)
return _mm_cvttss_si32(__a);
}
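A brief sketch contrasting the truncating conversion here with the MXCSR-rounded _mm_cvtss_si32 above (illustrative values only):

#include <xmmintrin.h>
#include <stdio.h>

int main(void) {
    __m128 v = _mm_set_ss(2.7f);
    printf("cvtt (truncate): %d\n", _mm_cvttss_si32(v));  /* 2 */
    printf("cvt  (rounded):  %d\n", _mm_cvtss_si32(v));   /* 3 under the default
                                                             round-to-nearest mode */
    return 0;
}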
+#ifdef __x86_64__
/// \brief Converts a float value contained in the lower 32 bits of a vector of
/// [4 x float] into a 64-bit integer, truncating the result when it is
/// inexact.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1388,6 +1411,7 @@ _mm_cvttss_si64(__m128 __a)
{
return __builtin_ia32_cvttss2si64((__v4sf)__a);
}
+#endif
/// \brief Converts two low-order float values in a 128-bit vector of
/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
@@ -1395,7 +1419,8 @@ _mm_cvttss_si64(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTTPS2PI / VTTPS2PI instructions.
+/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
+/// instructions.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1412,7 +1437,7 @@ _mm_cvttps_pi32(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTTPS2PI instruction.
+/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1430,7 +1455,7 @@ _mm_cvtt_ps2pi(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1453,7 +1478,7 @@ _mm_cvtsi32_ss(__m128 __a, int __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1477,7 +1502,7 @@ _mm_cvt_si2ss(__m128 __a, int __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1502,7 +1527,7 @@ _mm_cvtsi64_ss(__m128 __a, long long __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1525,7 +1550,7 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
@@ -1546,7 +1571,7 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
@@ -1558,13 +1583,13 @@ _mm_cvtss_f32(__m128 __a)
return __a[0];
}
-/// \brief Loads two packed float values from the address __p into the
+/// \brief Loads two packed float values from the address \a __p into the
/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
/// are copied from the low-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction.
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
@@ -1585,13 +1610,13 @@ _mm_loadh_pi(__m128 __a, const __m64 *__p)
return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}
-/// \brief Loads two packed float values from the address __p into the low-order
-/// bits of a 128-bit vector of [4 x float]. The high-order bits are copied
-/// from the high-order bits of the first operand.
+/// \brief Loads two packed float values from the address \a __p into the
+/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
+/// are copied from the high-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction.
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
@@ -1619,7 +1644,7 @@ _mm_loadl_pi(__m128 __a, const __m64 *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
/// A pointer to a 32-bit memory location containing a single-precision
@@ -1642,13 +1667,13 @@ _mm_load_ss(const float *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS + shuffling </c>
/// instruction.
///
/// \param __p
/// A pointer to a float value to be loaded and duplicated.
-/// \returns A 128-bit vector of [4 x float] containing the loaded
-/// and duplicated values.
+/// \returns A 128-bit vector of [4 x float] containing the loaded and
+/// duplicated values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load1_ps(const float *__p)
{
@@ -1666,7 +1691,7 @@ _mm_load1_ps(const float *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit memory location. The address of the memory
@@ -1683,7 +1708,7 @@ _mm_load_ps(const float *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit memory location. The address of the memory
@@ -1703,7 +1728,7 @@ _mm_loadu_ps(const float *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
/// instruction.
///
/// \param __p
@@ -1725,7 +1750,6 @@ _mm_loadr_ps(const float *__p)
/// This intrinsic has no corresponding instruction.
///
/// \returns A 128-bit vector of [4 x float] containing undefined values.
-
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_undefined_ps(void)
{
@@ -1738,7 +1762,7 @@ _mm_undefined_ps(void)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __w
/// A single-precision floating-point value used to initialize the lower 32
@@ -1758,7 +1782,7 @@ _mm_set_ss(float __w)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
///
/// \param __w
/// A single-precision floating-point value used to initialize each vector
@@ -1777,7 +1801,7 @@ _mm_set1_ps(float __w)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
///
/// \param __w
/// A single-precision floating-point value used to initialize each vector
@@ -1849,7 +1873,7 @@ _mm_setr_ps(float __z, float __y, float __x, float __w)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
/// all elements set to zero.
@@ -1864,7 +1888,7 @@ _mm_setzero_ps(void)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPEXTRQ / MOVQ instruction.
+/// This intrinsic corresponds to the <c> VPEXTRQ / MOVQ </c> instruction.
///
/// \param __p
/// A pointer to a 64-bit memory location.
@@ -1881,7 +1905,7 @@ _mm_storeh_pi(__m64 *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
///
/// \param __p
/// A pointer to a memory location that will receive the float values.
@@ -1898,7 +1922,7 @@ _mm_storel_pi(__m64 *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
/// A pointer to a 32-bit memory location.
@@ -1913,12 +1937,12 @@ _mm_store_ss(float *__p, __m128 __a)
((struct __mm_store_ss_struct*)__p)->__u = __a[0];
}
-/// \brief Stores float values from a 128-bit vector of [4 x float] to an
-/// unaligned memory location.
+/// \brief Stores a 128-bit vector of [4 x float] to an unaligned memory
+/// location.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit memory location. The address of the memory
@@ -1934,19 +1958,18 @@ _mm_storeu_ps(float *__p, __m128 __a)
((struct __storeu_ps*)__p)->__v = __a;
}
-/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
-/// four contiguous elements in an aligned memory location.
+/// \brief Stores a 128-bit vector of [4 x float] into an aligned memory
+/// location.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
-/// instruction.
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
///
/// \param __p
-/// A pointer to a 128-bit memory location.
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location has to be 16-byte aligned.
/// \param __a
-/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
-/// of the four contiguous elements pointed by __p.
+/// A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps(float *__p, __m128 __a)
{
@@ -1958,14 +1981,14 @@ _mm_store_ps(float *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
+/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
/// instruction.
///
/// \param __p
/// A pointer to a 128-bit memory location.
/// \param __a
/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
-/// of the four contiguous elements pointed by __p.
+///    of the four contiguous elements pointed to by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_ps(float *__p, __m128 __a)
{
@@ -1973,18 +1996,19 @@ _mm_store1_ps(float *__p, __m128 __a)
_mm_store_ps(__p, __a);
}
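To illustrate the aligned/unaligned store distinction clarified above, a minimal sketch (the _Alignas qualifier is C11; the stored values are arbitrary):

#include <xmmintrin.h>
#include <stdio.h>

int main(void) {
    _Alignas(16) float aligned[4];    /* 16-byte aligned: valid for _mm_store_ps        */
    float unaligned[5];               /* &unaligned[1] is generally not 16-byte aligned */
    __m128 v = _mm_set1_ps(1.5f);

    _mm_store_ps(aligned, v);         /* requires a 16-byte-aligned address */
    _mm_storeu_ps(&unaligned[1], v);  /* no alignment requirement */

    printf("%g %g\n", aligned[0], unaligned[1]);
    return 0;
}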
-/// \brief Stores float values from a 128-bit vector of [4 x float] to an
-/// aligned memory location.
+/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+/// four contiguous elements in an aligned memory location.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
+/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
+/// instruction.
///
/// \param __p
-/// A pointer to a 128-bit memory location. The address of the memory
-/// location has to be 128-bit aligned.
+/// A pointer to a 128-bit memory location.
/// \param __a
-/// A 128-bit vector of [4 x float] containing the values to be stored.
+/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
+///    of the four contiguous elements pointed to by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps1(float *__p, __m128 __a)
{
@@ -1996,7 +2020,7 @@ _mm_store_ps1(float *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
/// instruction.
///
/// \param __p
@@ -2029,20 +2053,21 @@ _mm_storer_ps(float *__p, __m128 __a)
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
-/// This intrinsic corresponds to the \c PREFETCHNTA instruction.
+/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
/// A pointer to a memory location containing a cache line of data.
/// \param sel
-/// A predefined integer constant specifying the type of prefetch operation:
-/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint.
-/// The PREFETCHNTA instruction will be generated.
+/// A predefined integer constant specifying the type of prefetch
+/// operation: \n
+/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
+/// PREFETCHNTA instruction will be generated. \n
/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
-/// be generated.
+/// be generated. \n
/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
-/// be generated.
+/// be generated. \n
/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
-/// be generated.
+/// be generated.
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
#endif
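A sketch of a typical streaming use of _mm_prefetch with the non-temporal hint listed above (the 16-float look-ahead distance is an arbitrary illustrative choice, and _MM_HINT_NTA is assumed to be the usual constant provided alongside this macro):

#include <xmmintrin.h>
#include <stddef.h>

/* Sum an array while prefetching one cache line (16 floats) ahead with the
   non-temporal hint, since the data will not be reused. */
float sum_streaming(const float *data, size_t n) {
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        if (i + 16 < n)
            _mm_prefetch((const char *)&data[i + 16], _MM_HINT_NTA);
        sum += data[i];
    }
    return sum;
}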
@@ -2052,7 +2077,7 @@ _mm_storer_ps(float *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MOVNTQ instruction.
+/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
///
/// \param __p
/// A pointer to an aligned memory location used to store the register value.
@@ -2070,7 +2095,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit aligned memory location that will receive the
@@ -2083,6 +2108,10 @@ _mm_stream_ps(float *__p, __m128 __a)
__builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
/// \brief Forces strong memory ordering (serialization) between store
/// instructions preceding this instruction and store instructions following
/// this instruction, ensuring the system completes all previous stores
@@ -2090,28 +2119,32 @@ _mm_stream_ps(float *__p, __m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c SFENCE instruction.
+/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_sfence(void)
-{
- __builtin_ia32_sfence();
-}
+void _mm_sfence(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
/// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and
/// returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
+/// \code
+/// int _mm_extract_pi16(__m64 a, int n);
+/// \endcode
///
-/// \param __a
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param a
/// A 64-bit vector of [4 x i16].
-/// \param __n
-/// An immediate integer operand that determines which bits are extracted:
-/// 0: Bits [15:0] are copied to the destination.
-/// 1: Bits [31:16] are copied to the destination.
-/// 2: Bits [47:32] are copied to the destination.
+/// \param n
+/// An immediate integer operand that determines which bits are extracted: \n
+/// 0: Bits [15:0] are copied to the destination. \n
+/// 1: Bits [31:16] are copied to the destination. \n
+/// 2: Bits [47:32] are copied to the destination. \n
/// 3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) __extension__ ({ \
@@ -2119,26 +2152,30 @@ _mm_sfence(void)
/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
-/// specified by the immediate operand __n.
+/// specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
+/// \code
+/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
+/// \endcode
///
-/// \param __a
+/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
+///
+/// \param a
/// A 64-bit vector of [4 x i16].
-/// \param __d
+/// \param d
/// An integer. The lower 16-bit value from this operand is written to the
-/// destination at the offset specified by operand __n.
-/// \param __n
+/// destination at the offset specified by operand \a n.
+/// \param n
/// An immediate integer operand that determines which bits are to be used
-/// in the destination.
-/// 0: Bits [15:0] are copied to the destination.
-/// 1: Bits [31:16] are copied to the destination.
-/// 2: Bits [47:32] are copied to the destination.
-/// 3: Bits [63:48] are copied to the destination.
+/// in the destination. \n
+/// 0: Bits [15:0] are copied to the destination. \n
+/// 1: Bits [31:16] are copied to the destination. \n
+/// 2: Bits [47:32] are copied to the destination. \n
+/// 3: Bits [63:48] are copied to the destination. \n
/// The remaining bits in the destination are copied from the corresponding
-/// bits in operand __a.
+/// bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
/// operands.
#define _mm_insert_pi16(a, d, n) __extension__ ({ \
@@ -2150,7 +2187,7 @@ _mm_sfence(void)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMAXSW instruction.
+/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2169,7 +2206,7 @@ _mm_max_pi16(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMAXUB instruction.
+/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2188,7 +2225,7 @@ _mm_max_pu8(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMINSW instruction.
+/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2207,7 +2244,7 @@ _mm_min_pi16(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMINUB instruction.
+/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2226,7 +2263,7 @@ _mm_min_pu8(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMOVMSKB instruction.
+/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing the values with bits to be extracted.
@@ -2244,7 +2281,7 @@ _mm_movemask_pi8(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PMULHUW instruction.
+/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2262,27 +2299,31 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSHUFW instruction.
-///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
+/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
+///
/// \param a
/// A 64-bit integer vector containing the values to be shuffled.
/// \param n
/// An immediate value containing an 8-bit value specifying which elements to
-/// copy from a. The destinations within the 64-bit destination are assigned
-/// values as follows:
-/// Bits [1:0] are used to assign values to bits [15:0] in the destination.
-/// Bits [3:2] are used to assign values to bits [31:16] in the destination.
-/// Bits [5:4] are used to assign values to bits [47:32] in the destination.
-/// Bits [7:6] are used to assign values to bits [63:48] in the destination.
-/// Bit value assignments:
-/// 00: assigned from bits [15:0] of a.
-/// 01: assigned from bits [31:16] of a.
-/// 10: assigned from bits [47:32] of a.
-/// 11: assigned from bits [63:48] of a.
+/// copy from \a a. The destinations within the 64-bit destination are
+/// assigned values as follows: \n
+/// Bits [1:0] are used to assign values to bits [15:0] in the
+/// destination. \n
+/// Bits [3:2] are used to assign values to bits [31:16] in the
+/// destination. \n
+/// Bits [5:4] are used to assign values to bits [47:32] in the
+/// destination. \n
+/// Bits [7:6] are used to assign values to bits [63:48] in the
+/// destination. \n
+/// Bit value assignments: \n
+/// 00: assigned from bits [15:0] of \a a. \n
+/// 01: assigned from bits [31:16] of \a a. \n
+/// 10: assigned from bits [47:32] of \a a. \n
+/// 11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) __extension__ ({ \
(__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
@@ -2295,15 +2336,15 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c MASKMOVQ instruction.
+/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
///
/// \param __d
/// A 64-bit integer vector containing the values with elements to be copied.
/// \param __n
/// A 64-bit integer vector operand. The most significant bit from each 8-bit
-/// element determines whether the corresponding element in operand __d is
-/// copied. If the most significant bit of a given element is 1, the
-/// corresponding element in operand __d is copied.
+/// element determines whether the corresponding element in operand \a __d
+/// is copied. If the most significant bit of a given element is 1, the
+/// corresponding element in operand \a __d is copied.
/// \param __p
/// A pointer to a 64-bit memory location that will receive the conditionally
/// copied integer values. The address of the memory location does not have
@@ -2320,7 +2361,7 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PAVGB instruction.
+/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2339,7 +2380,7 @@ _mm_avg_pu8(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PAVGW instruction.
+/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2359,7 +2400,7 @@ _mm_avg_pu16(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c PSADBW instruction.
+/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
@@ -2374,24 +2415,42 @@ _mm_sad_pu8(__m64 __a, __m64 __b)
return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
-/// integer value. There are several groups of macros associated with this
+/// integer value.
+///
+/// There are several groups of macros associated with this
/// intrinsic, including:
-/// * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+/// <ul>
+/// <li>
+/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
/// _MM_GET_EXCEPTION_STATE().
-/// * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+/// </li>
+/// <li>
+/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
-/// * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+/// </li>
+/// <li>
+/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
/// _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
-/// * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+/// </li>
+/// <li>
+/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
-/// * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+/// </li>
+/// <li>
+/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
/// _MM_GET_DENORMALS_ZERO_MODE().
+/// </li>
+/// </ul>
///
/// For example, the expression below checks if an overflow exception has
/// occurred:
@@ -2402,35 +2461,45 @@ _mm_sad_pu8(__m64 __a, __m64 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction.
+/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
/// register.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_getcsr(void)
-{
- return __builtin_ia32_stmxcsr();
-}
-
-/// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There
-/// are several groups of macros associated with this intrinsic, including:
-/// * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+unsigned int _mm_getcsr(void);
+
+/// \brief Sets the MXCSR register with the 32-bit unsigned integer value.
+///
+/// There are several groups of macros associated with this intrinsic,
+/// including:
+/// <ul>
+/// <li>
+/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
-/// * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+/// </li>
+/// <li>
+/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
/// of these macros.
-/// * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+/// </li>
+/// <li>
+/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
-/// * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+/// </li>
+/// <li>
+/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
/// one of these macros.
-/// * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+/// </li>
+/// <li>
+/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
+/// </li>
+/// </ul>
///
/// For example, the following expression causes subsequent floating-point
/// operations to round up:
@@ -2444,15 +2513,15 @@ _mm_getcsr(void)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction.
+/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
/// A 32-bit unsigned integer value to be written to the MXCSR register.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_setcsr(unsigned int __i)
-{
- __builtin_ia32_ldmxcsr(__i);
-}
+void _mm_setcsr(unsigned int);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
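
The convenience wrappers listed above are the usual way to drive this pair, rather than hand-built _mm_getcsr()/_mm_setcsr() expressions. A minimal sketch, assuming an x86-64 target where scalar float arithmetic is done in SSE and therefore honours MXCSR:

/* Hedged sketch: change the MXCSR rounding mode with the documented
 * _MM_SET_ROUNDING_MODE() wrapper, then restore the saved mode. */
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
    unsigned int saved = _MM_GET_ROUNDING_MODE();

    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);   /* subsequent FP ops round up */
    volatile float x = 1.0f, y = 3.0f;     /* volatile: force a runtime divss */
    printf("1/3 rounded up: %.10f\n", x / y);

    _MM_SET_ROUNDING_MODE(saved);          /* restore the previous mode */
    printf("1/3 restored:   %.10f\n", x / y);
    return 0;
}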
/// \brief Selects 4 float values from the 128-bit operands of [4 x float], as
/// specified by the immediate value operand.
@@ -2463,7 +2532,7 @@ _mm_setcsr(unsigned int __i)
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
-/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
+/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
/// A 128-bit vector of [4 x float].
@@ -2471,18 +2540,23 @@ _mm_setcsr(unsigned int __i)
/// A 128-bit vector of [4 x float].
/// \param mask
/// An immediate value containing an 8-bit value specifying which elements to
-/// copy from a and b.
-/// Bits [3:0] specify the values copied from operand a.
-/// Bits [7:4] specify the values copied from operand b. The destinations
-/// within the 128-bit destination are assigned values as follows:
-/// Bits [1:0] are used to assign values to bits [31:0] in the destination.
-/// Bits [3:2] are used to assign values to bits [63:32] in the destination.
-/// Bits [5:4] are used to assign values to bits [95:64] in the destination.
-/// Bits [7:6] are used to assign values to bits [127:96] in the destination.
-/// Bit value assignments:
-/// 00: Bits [31:0] copied from the specified operand.
-/// 01: Bits [63:32] copied from the specified operand.
-/// 10: Bits [95:64] copied from the specified operand.
+///    copy from \a a and \a b. \n
+/// Bits [3:0] specify the values copied from operand \a a. \n
+/// Bits [7:4] specify the values copied from operand \a b. \n
+/// The destinations within the 128-bit destination are assigned values as
+/// follows: \n
+/// Bits [1:0] are used to assign values to bits [31:0] in the
+/// destination. \n
+/// Bits [3:2] are used to assign values to bits [63:32] in the
+/// destination. \n
+/// Bits [5:4] are used to assign values to bits [95:64] in the
+/// destination. \n
+/// Bits [7:6] are used to assign values to bits [127:96] in the
+/// destination. \n
+/// Bit value assignments: \n
+/// 00: Bits [31:0] copied from the specified operand. \n
+/// 01: Bits [63:32] copied from the specified operand. \n
+/// 10: Bits [95:64] copied from the specified operand. \n
/// 11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
@@ -2493,20 +2567,19 @@ _mm_setcsr(unsigned int __i)
4 + (((mask) >> 6) & 0x3)); })
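
The mask encoding spelled out above is easiest to see on a concrete value. A hedged sketch (arbitrary element values, SSE-enabled x86 target assumed) that reverses a vector by shuffling it against itself:

/* 0x1B == 0b00011011: the four 2-bit fields select elements 3,2,1,0,
 * so the destination receives a[3], a[2], a[1], a[0]. */
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* {1,2,3,4}, low to high */
    __m128 r = _mm_shuffle_ps(a, a, 0x1B);          /* -> {4,3,2,1} */

    float out[4];
    _mm_storeu_ps(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 4 3 2 1 */
    return 0;
}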
/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
-/// [4 x float] and interleaves them into a 128-bit vector of [4 x
-/// float].
+/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction.
+/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
///
/// \param __a
-/// A 128-bit vector of [4 x float].
-/// Bits [95:64] are written to bits [31:0] of the destination.
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [95:64] are written to bits [31:0] of the destination. \n
/// Bits [127:96] are written to bits [95:64] of the destination.
/// \param __b
/// A 128-bit vector of [4 x float].
-/// Bits [95:64] are written to bits [63:32] of the destination.
+/// Bits [95:64] are written to bits [63:32] of the destination. \n
/// Bits [127:96] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -2516,20 +2589,19 @@ _mm_unpackhi_ps(__m128 __a, __m128 __b)
}
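
A hedged sketch of the interleaving just described, with arbitrary element values; _mm_unpacklo_ps, documented next, applies the same pattern to the low halves:

/* The high halves of a and b are interleaved per the bit ranges above:
 * result = { a[2], b[2], a[3], b[3] }. */
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* {1,2,3,4}, low to high */
    __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);  /* {5,6,7,8}, low to high */
    __m128 hi = _mm_unpackhi_ps(a, b);              /* -> {3,7,4,8} */

    float out[4];
    _mm_storeu_ps(out, hi);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 3 7 4 8 */
    return 0;
}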
/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
-/// [4 x float] and interleaves them into a 128-bit vector of [4 x
-/// float].
+/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction.
+/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
///
/// \param __a
-/// A 128-bit vector of [4 x float].
-/// Bits [31:0] are written to bits [31:0] of the destination.
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [31:0] are written to bits [31:0] of the destination. \n
/// Bits [63:32] are written to bits [95:64] of the destination.
/// \param __b
-/// A 128-bit vector of [4 x float].
-/// Bits [31:0] are written to bits [63:32] of the destination.
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [31:0] are written to bits [63:32] of the destination. \n
/// Bits [63:32] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -2544,7 +2616,7 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __a
/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
@@ -2565,7 +2637,7 @@ _mm_move_ss(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
///
/// \param __a
/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
@@ -2586,7 +2658,7 @@ _mm_movehl_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
///
/// \param __a
/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
@@ -2606,7 +2678,8 @@ _mm_movelh_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
@@ -2636,7 +2709,8 @@ _mm_cvtpi16_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
@@ -2665,7 +2739,8 @@ _mm_cvtpu16_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
@@ -2689,7 +2764,8 @@ _mm_cvtpi8_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
@@ -2713,7 +2789,8 @@ _mm_cvtpu8_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
@@ -2741,12 +2818,13 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
/// packs the results into a 64-bit integer vector of [4 x i16]. If the
/// floating-point element is NaN or infinity, or if the floating-point
/// element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
-/// to 0x8000. Otherwise if the floating-point element is greater
-/// than 0x7FFF, it is converted to 0x7FFF.
+///    to 0x8000. Otherwise, if the floating-point element is greater than
+/// 0x7FFF, it is converted to 0x7FFF.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// A 128-bit floating-point vector of [4 x float].
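
The saturation rules above determine the packed result directly. A hedged sketch (values chosen to exercise each rule; SSE/MMX-enabled x86 target assumed):

/* Floats are converted with the current rounding mode and packed with
 * signed 16-bit saturation, as described above. */
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>

int main(void) {
    /* _mm_set_ps lists elements from index 3 down to index 0. */
    __m128 v = _mm_set_ps(100000.0f, -100000.0f, -2.0f, 1.25f);
    __m64 packed = _mm_cvtps_pi16(v);

    short out[4];
    memcpy(out, &packed, sizeof(out));
    _mm_empty();  /* leave MMX state before calling printf */

    /* Expected: 1, -2, -32768 (0x8000), 32767 (0x7FFF). */
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
    return 0;
}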
@@ -2770,12 +2848,13 @@ _mm_cvtps_pi16(__m128 __a)
/// [8 x i8]. The upper 32 bits of the vector are set to 0. If the
/// floating-point element is NaN or infinity, or if the floating-point
/// element is greater than 0x7FFFFFFF or less than -0x80, it is converted
-/// to 0x80. Otherwise if the floating-point element is greater
-/// than 0x7F, it is converted to 0x7F.
+///    to 0x80. Otherwise, if the floating-point element is greater than 0x7F,
+/// it is converted to 0x7F.
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c>
+/// instruction.
///
/// \param __a
/// 128-bit floating-point vector of [4 x float].
@@ -2799,7 +2878,7 @@ _mm_cvtps_pi8(__m128 __a)
///
/// \headerfile <x86intrin.h>
///
-/// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction.
+/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
///
/// \param __a
/// A 128-bit floating-point vector of [4 x float].