diff options
Diffstat (limited to 'lib/Headers/arm_neon.h')
-rw-r--r-- | lib/Headers/arm_neon.h | 537 |
1 files changed, 0 insertions, 537 deletions
diff --git a/lib/Headers/arm_neon.h b/lib/Headers/arm_neon.h deleted file mode 100644 index 4508a27f36a45..0000000000000 --- a/lib/Headers/arm_neon.h +++ /dev/null @@ -1,537 +0,0 @@ -/*===---- arm_neon.h - NEON intrinsics --------------------------------------=== - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __ARM_NEON_H -#define __ARM_NEON_H - -#ifndef __ARM_NEON__ -#error "NEON support not enabled" -#endif - -// NEON document appears to be specified in terms of stdint types. -#include <stdint.h> - -// Define some NEON-specific scalar types for floats and polynomials. -typedef float float32_t; -typedef uint8_t poly8_t; - -// FIXME: probably need a 'poly' attribute or something for correct codegen to -// disambiguate from uint16_t. -typedef uint16_t poly16_t; - -typedef __attribute__(( __vector_size__(8) )) int8_t __neon_int8x8_t; -typedef __attribute__(( __vector_size__(16) )) int8_t __neon_int8x16_t; -typedef __attribute__(( __vector_size__(8) )) int16_t __neon_int16x4_t; -typedef __attribute__(( __vector_size__(16) )) int16_t __neon_int16x8_t; -typedef __attribute__(( __vector_size__(8) )) int32_t __neon_int32x2_t; -typedef __attribute__(( __vector_size__(16) )) int32_t __neon_int32x4_t; -typedef __attribute__(( __vector_size__(8) )) int64_t __neon_int64x1_t; -typedef __attribute__(( __vector_size__(16) )) int64_t __neon_int64x2_t; -typedef __attribute__(( __vector_size__(8) )) uint8_t __neon_uint8x8_t; -typedef __attribute__(( __vector_size__(16) )) uint8_t __neon_uint8x16_t; -typedef __attribute__(( __vector_size__(8) )) uint16_t __neon_uint16x4_t; -typedef __attribute__(( __vector_size__(16) )) uint16_t __neon_uint16x8_t; -typedef __attribute__(( __vector_size__(8) )) uint32_t __neon_uint32x2_t; -typedef __attribute__(( __vector_size__(16) )) uint32_t __neon_uint32x4_t; -typedef __attribute__(( __vector_size__(8) )) uint64_t __neon_uint64x1_t; -typedef __attribute__(( __vector_size__(16) )) uint64_t __neon_uint64x2_t; -typedef __attribute__(( __vector_size__(8) )) uint16_t __neon_float16x4_t; -typedef __attribute__(( __vector_size__(16) )) uint16_t __neon_float16x8_t; -typedef __attribute__(( __vector_size__(8) )) float32_t __neon_float32x2_t; -typedef __attribute__(( __vector_size__(16) )) float32_t __neon_float32x4_t; -typedef __attribute__(( __vector_size__(8) )) poly8_t __neon_poly8x8_t; -typedef __attribute__(( __vector_size__(16) )) poly8_t __neon_poly8x16_t; -typedef __attribute__(( __vector_size__(8) )) poly16_t __neon_poly16x4_t; -typedef __attribute__(( __vector_size__(16) )) poly16_t __neon_poly16x8_t; - -typedef struct __int8x8_t { - __neon_int8x8_t val; -} int8x8_t; - -typedef struct __int8x16_t { - __neon_int8x16_t val; -} int8x16_t; - -typedef struct __int16x4_t { - __neon_int16x4_t val; -} int16x4_t; - -typedef struct __int16x8_t { - __neon_int16x8_t val; -} int16x8_t; - -typedef struct __int32x2_t { - __neon_int32x2_t val; -} int32x2_t; - -typedef struct __int32x4_t { - __neon_int32x4_t val; -} int32x4_t; - -typedef struct __int64x1_t { - __neon_int64x1_t val; -} int64x1_t; - -typedef struct __int64x2_t { - __neon_int64x2_t val; -} int64x2_t; - -typedef struct __uint8x8_t { - __neon_uint8x8_t val; -} uint8x8_t; - -typedef struct __uint8x16_t { - __neon_uint8x16_t val; -} uint8x16_t; - -typedef struct __uint16x4_t { - __neon_uint16x4_t val; -} uint16x4_t; - -typedef struct __uint16x8_t { - __neon_uint16x8_t val; -} uint16x8_t; - -typedef struct __uint32x2_t { - __neon_uint32x2_t val; -} uint32x2_t; - -typedef struct __uint32x4_t { - __neon_uint32x4_t val; -} uint32x4_t; - -typedef struct __uint64x1_t { - __neon_uint64x1_t val; -} uint64x1_t; - -typedef struct __uint64x2_t { - __neon_uint64x2_t val; -} uint64x2_t; - -typedef struct __float16x4_t { - __neon_float16x4_t val; -} float16x4_t; - -typedef struct __float16x8_t { - __neon_float16x8_t val; -} float16x8_t; - -typedef struct __float32x2_t { - __neon_float32x2_t val; -} float32x2_t; - -typedef struct __float32x4_t { - __neon_float32x4_t val; -} float32x4_t; - -typedef struct __poly8x8_t { - __neon_poly8x8_t val; -} poly8x8_t; - -typedef struct __poly8x16_t { - __neon_poly8x16_t val; -} poly8x16_t; - -typedef struct __poly16x4_t { - __neon_poly16x4_t val; -} poly16x4_t; - -typedef struct __poly16x8_t { - __neon_poly16x8_t val; -} poly16x8_t; - -// FIXME: write tool to stamp out the structure-of-array types, possibly gen this whole file. - -// Intrinsics, per ARM document DUI0348B -#define __ai static __attribute__((__always_inline__)) - -#define INTTYPES_WIDE(op, builtin) \ - __ai int16x8_t op##_s8(int16x8_t a, int8x8_t b) { return (int16x8_t){ builtin(a.val, b.val) }; } \ - __ai int32x4_t op##_s16(int32x4_t a, int16x4_t b) { return (int32x4_t){ builtin(a.val, b.val) }; } \ - __ai int64x2_t op##_s32(int64x2_t a, int32x2_t b) { return (int64x2_t){ builtin(a.val, b.val) }; } \ - __ai uint16x8_t op##_u8(uint16x8_t a, uint8x8_t b) { return (uint16x8_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##_u16(uint32x4_t a, uint16x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } \ - __ai uint64x2_t op##_u32(uint64x2_t a, uint32x2_t b) { return (uint64x2_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_WIDENING(op, builtin) \ - __ai int16x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int16x8_t){ builtin(a.val, b.val) }; } \ - __ai int32x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int32x4_t){ builtin(a.val, b.val) }; } \ - __ai int64x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int64x2_t){ builtin(a.val, b.val) }; } \ - __ai uint16x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint16x8_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } \ - __ai uint64x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint64x2_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_WIDENING_MUL(op, builtin) \ - __ai int16x8_t op##_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return (int16x8_t){ builtin(a.val, b.val, c.val) }; } \ - __ai int32x4_t op##_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return (int32x4_t){ builtin(a.val, b.val, c.val) }; } \ - __ai int64x2_t op##_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return (int64x2_t){ builtin(a.val, b.val, c.val) }; } \ - __ai uint16x8_t op##_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return (uint16x8_t){ builtin(a.val, b.val, c.val) }; } \ - __ai uint32x4_t op##_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return (uint32x4_t){ builtin(a.val, b.val, c.val) }; } \ - __ai uint64x2_t op##_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return (uint64x2_t){ builtin(a.val, b.val, c.val) }; } - -#define INTTYPES_NARROWING(op, builtin) \ - __ai int8x8_t op##_s16(int16x8_t a, int16x8_t b) { return (int8x8_t){ builtin(a.val, b.val) }; } \ - __ai int16x4_t op##_s32(int32x4_t a, int32x4_t b) { return (int16x4_t){ builtin(a.val, b.val) }; } \ - __ai int32x2_t op##_s64(int64x2_t a, int64x2_t b) { return (int32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint8x8_t op##_u16(uint16x8_t a, uint16x8_t b) { return (uint8x8_t){ builtin(a.val, b.val) }; } \ - __ai uint16x4_t op##_u32(uint32x4_t a, uint32x4_t b) { return (uint16x4_t){ builtin(a.val, b.val) }; } \ - __ai uint32x2_t op##_u64(uint64x2_t a, uint64x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_ADD_32(op, builtin) \ - __ai int8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){ builtin(a.val, b.val) }; } \ - __ai int16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){ builtin(a.val, b.val) }; } \ - __ai int32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){ builtin(a.val, b.val) }; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){ builtin(a.val, b.val) }; } \ - __ai uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } \ - __ai int8x16_t op##q_s8(int8x16_t a, int8x16_t b) { return (int8x16_t){ builtin(a.val, b.val) }; } \ - __ai int16x8_t op##q_s16(int16x8_t a, int16x8_t b) { return (int16x8_t){ builtin(a.val, b.val) }; } \ - __ai int32x4_t op##q_s32(int32x4_t a, int32x4_t b) { return (int32x4_t){ builtin(a.val, b.val) }; } \ - __ai uint8x16_t op##q_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){ builtin(a.val, b.val) }; } \ - __ai uint16x8_t op##q_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##q_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_ADD_64(op, builtin) \ - __ai int64x1_t op##_s64(int64x1_t a, int64x1_t b) { return (int64x1_t){ builtin(a.val, b.val) }; } \ - __ai uint64x1_t op##_u64(uint64x1_t a, uint64x1_t b) { return (uint64x1_t){ builtin(a.val, b.val) }; } \ - __ai int64x2_t op##q_s64(int64x2_t a, int64x2_t b) { return (int64x2_t){ builtin(a.val, b.val) }; } \ - __ai uint64x2_t op##q_u64(uint64x2_t a, uint64x2_t b) { return (uint64x2_t){ builtin(a.val, b.val) }; } - -#define FLOATTYPES_CMP(op, builtin) \ - __ai uint32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##q_f32(float32x4_t a, float32x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } - -#define INT_FLOAT_CMP_OP(op, cc) \ - __ai uint8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (uint8x8_t){(__neon_uint8x8_t)(a.val cc b.val)}; } \ - __ai uint16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (uint16x4_t){(__neon_uint16x4_t)(a.val cc b.val)}; } \ - __ai uint32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (uint32x2_t){(__neon_uint32x2_t)(a.val cc b.val)}; } \ - __ai uint32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (uint32x2_t){(__neon_uint32x2_t)(a.val cc b.val)}; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){a.val cc b.val}; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){a.val cc b.val}; } \ - __ai uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){a.val cc b.val}; } \ - __ai uint8x16_t op##q_s8(int8x16_t a, int8x16_t b) { return (uint8x16_t){(__neon_uint8x16_t)(a.val cc b.val)}; } \ - __ai uint16x8_t op##q_s16(int16x8_t a, int16x8_t b) { return (uint16x8_t){(__neon_uint16x8_t)(a.val cc b.val)}; } \ - __ai uint32x4_t op##q_s32(int32x4_t a, int32x4_t b) { return (uint32x4_t){(__neon_uint32x4_t)(a.val cc b.val)}; } \ - __ai uint32x4_t op##q_f32(float32x4_t a, float32x4_t b) { return (uint32x4_t){(__neon_uint32x4_t)(a.val cc b.val)}; } \ - __ai uint8x16_t op##q_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){a.val cc b.val}; } \ - __ai uint16x8_t op##q_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){a.val cc b.val}; } \ - __ai uint32x4_t op##q_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){a.val cc b.val}; } - -#define INT_UNARY(op, builtin) \ - __ai int8x8_t op##_s8(int8x8_t a) { return (int8x8_t){ builtin(a.val) }; } \ - __ai int16x4_t op##_s16(int16x4_t a) { return (int16x4_t){ builtin(a.val) }; } \ - __ai int32x2_t op##_s32(int32x2_t a) { return (int32x2_t){ builtin(a.val) }; } \ - __ai int8x16_t op##q_s8(int8x16_t a) { return (int8x16_t){ builtin(a.val) }; } \ - __ai int16x8_t op##q_s16(int16x8_t a) { return (int16x8_t){ builtin(a.val) }; } \ - __ai int32x4_t op##q_s32(int32x4_t a) { return (int32x4_t){ builtin(a.val) }; } - -#define FP_UNARY(op, builtin) \ - __ai float32x2_t op##_f32(float32x2_t a) { return (float32x2_t){ builtin(a.val) }; } \ - __ai float32x4_t op##q_f32(float32x4_t a) { return (float32x4_t){ builtin(a.val) }; } - -#define FP_BINARY(op, builtin) \ - __ai float32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (float32x2_t){ builtin(a.val, b.val) }; } \ - __ai float32x4_t op##q_f32(float32x4_t a, float32x4_t b) { return (float32x4_t){ builtin(a.val, b.val) }; } - -#define INT_FP_PAIRWISE_ADD(op, builtin) \ - __ai int8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){ builtin(a.val, b.val) }; } \ - __ai int16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){ builtin(a.val, b.val) }; } \ - __ai int32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){ builtin(a.val, b.val) }; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){ builtin(a.val, b.val) }; } \ - __ai uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } \ - __ai float32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (float32x2_t){ builtin(a.val, b.val) }; } - -#define INT_LOGICAL_OP(op, lop) \ - __ai int8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){ a.val lop b.val }; } \ - __ai int16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){ a.val lop b.val }; } \ - __ai int32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){ a.val lop b.val }; } \ - __ai int64x1_t op##_s64(int64x1_t a, int64x1_t b) { return (int64x1_t){ a.val lop b.val }; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){ a.val lop b.val }; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){ a.val lop b.val }; } \ - __ai uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){ a.val lop b.val }; } \ - __ai uint64x1_t op##_u64(uint64x1_t a, uint64x1_t b) { return (uint64x1_t){ a.val lop b.val }; } \ - __ai int8x16_t op##q_s8(int8x16_t a, int8x16_t b) { return (int8x16_t){ a.val lop b.val }; } \ - __ai int16x8_t op##q_s16(int16x8_t a, int16x8_t b) { return (int16x8_t){ a.val lop b.val }; } \ - __ai int32x4_t op##q_s32(int32x4_t a, int32x4_t b) { return (int32x4_t){ a.val lop b.val }; } \ - __ai int64x2_t op##q_s64(int64x2_t a, int64x2_t b) { return (int64x2_t){ a.val lop b.val }; } \ - __ai uint8x16_t op##q_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){ a.val lop b.val }; } \ - __ai uint16x8_t op##q_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){ a.val lop b.val }; } \ - __ai uint32x4_t op##q_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){ a.val lop b.val }; } \ - __ai uint64x2_t op##q_u64(uint64x2_t a, uint64x2_t b) { return (uint64x2_t){ a.val lop b.val }; } - -// vector add -__ai int8x8_t vadd_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){a.val + b.val}; } -__ai int16x4_t vadd_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){a.val + b.val}; } -__ai int32x2_t vadd_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){a.val + b.val}; } -__ai int64x1_t vadd_s64(int64x1_t a, int64x1_t b) { return (int64x1_t){a.val + b.val}; } -__ai float32x2_t vadd_f32(float32x2_t a, float32x2_t b) { return (float32x2_t){a.val + b.val}; } -__ai uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){a.val + b.val}; } -__ai uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){a.val + b.val}; } -__ai uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){a.val + b.val}; } -__ai uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b) { return (uint64x1_t){a.val + b.val}; } -__ai int8x16_t vaddq_s8(int8x16_t a, int8x16_t b) { return (int8x16_t){a.val + b.val}; } -__ai int16x8_t vaddq_s16(int16x8_t a, int16x8_t b) { return (int16x8_t){a.val + b.val}; } -__ai int32x4_t vaddq_s32(int32x4_t a, int32x4_t b) { return (int32x4_t){a.val + b.val}; } -__ai int64x2_t vaddq_s64(int64x2_t a, int64x2_t b) { return (int64x2_t){a.val + b.val}; } -__ai float32x4_t vaddq_f32(float32x4_t a, float32x4_t b) { return (float32x4_t){a.val + b.val}; } -__ai uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){a.val + b.val}; } -__ai uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){a.val + b.val}; } -__ai uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){a.val + b.val}; } -__ai uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b) { return (uint64x2_t){a.val + b.val}; } - -// vector long add -INTTYPES_WIDENING(vaddl, __builtin_neon_vaddl) - -// vector wide add -INTTYPES_WIDE(vaddw, __builtin_neon_vaddw) - -// halving add -// rounding halving add -INTTYPES_ADD_32(vhadd, __builtin_neon_vhadd) -INTTYPES_ADD_32(vrhadd, __builtin_neon_vrhadd) - -// saturating add -INTTYPES_ADD_32(vqadd, __builtin_neon_vqadd) -INTTYPES_ADD_64(vqadd, __builtin_neon_vqadd) - -// add high half -// rounding add high half -INTTYPES_NARROWING(vaddhn, __builtin_neon_vaddhn) -INTTYPES_NARROWING(vraddhn, __builtin_neon_vraddhn) - -// multiply -// mul-poly - -// multiple accumulate -// multiple subtract - -// multiple accumulate long -// multiple subtract long -INTTYPES_WIDENING_MUL(vmlal, __builtin_neon_vmlal) -INTTYPES_WIDENING_MUL(vmlsl, __builtin_neon_vmlsl) - -// saturating doubling multiply high -// saturating rounding doubling multiply high - -// saturating doubling multiply accumulate long -// saturating doubling multiply subtract long - -// long multiply -// long multiply-poly -INTTYPES_WIDENING(vmull, __builtin_neon_vmull) -__ai poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b) { return (poly16x8_t){ __builtin_neon_vmull(a.val, b.val) }; } - -// saturating doubling long multiply - -// subtract - -// long subtract -INTTYPES_WIDENING(vsubl, __builtin_neon_vsubl) - -// wide subtract -INTTYPES_WIDE(vsubw, __builtin_neon_vsubw) - -// saturating subtract -INTTYPES_ADD_32(vqsub, __builtin_neon_vqsub) -INTTYPES_ADD_64(vqsub, __builtin_neon_vqsub) - -// halving subtract -INTTYPES_ADD_32(vhsub, __builtin_neon_vhsub) - -// subtract high half -// rounding subtract high half -INTTYPES_NARROWING(vsubhn, __builtin_neon_vsubhn) -INTTYPES_NARROWING(vrsubhn, __builtin_neon_vrsubhn) - -// compare eq -// compare ge -// compare le -// compare gt -// compare lt -INT_FLOAT_CMP_OP(vceq, ==) -INT_FLOAT_CMP_OP(vcge, >=) -INT_FLOAT_CMP_OP(vcle, <=) -INT_FLOAT_CMP_OP(vcgt, >) -INT_FLOAT_CMP_OP(vclt, <) - -// compare eq-poly - -// compare abs ge -// compare abs le -// compare abs gt -// compare abs lt -FLOATTYPES_CMP(vcage, __builtin_neon_vcage) -FLOATTYPES_CMP(vcale, __builtin_neon_vcale) -FLOATTYPES_CMP(vcagt, __builtin_neon_vcagt) -FLOATTYPES_CMP(vcalt, __builtin_neon_vcalt) - -// test bits - -// abs diff -INTTYPES_ADD_32(vabd, __builtin_neon_vabd) -FP_BINARY(vabd, __builtin_neon_vabd) - -// abs diff long -INTTYPES_WIDENING(vabdl, __builtin_neon_vabdl) - -// abs diff accumulate -// abs diff accumulate long - -// max -// min -INTTYPES_ADD_32(vmax, __builtin_neon_vmax) -FP_BINARY(vmax, __builtin_neon_vmax) -INTTYPES_ADD_32(vmin, __builtin_neon_vmin) -FP_BINARY(vmin, __builtin_neon_vmin) - -// pairwise add -// pairwise max -// pairwise min -INT_FP_PAIRWISE_ADD(vpadd, __builtin_neon_vpadd) -INT_FP_PAIRWISE_ADD(vpmax, __builtin_neon_vpmax) -INT_FP_PAIRWISE_ADD(vpmin, __builtin_neon_vpmin) - -// long pairwise add -// long pairwise add accumulate - -// recip -// recip sqrt -FP_BINARY(vrecps, __builtin_neon_vrecps) -FP_BINARY(vrsqrts, __builtin_neon_vrsqrts) - -// shl by vec -// saturating shl by vec -// rounding shl by vec -// saturating rounding shl by vec - -// shr by constant -// shl by constant -// rounding shr by constant -// shr by constant and accumulate -// rounding shr by constant and accumulate -// saturating shl by constant -// s->u saturating shl by constant -// narrowing saturating shr by constant -// s->u narrowing saturating shr by constant -// s->u rounding narrowing saturating shr by constant -// narrowing saturating shr by constant -// rounding narrowing shr by constant -// rounding narrowing saturating shr by constant -// widening shl by constant - -// shr and insert -// shl and insert - -// loads and stores, single vector -// loads and stores, lane -// loads, dupe - -// loads and stores, arrays - -// vget,vgetq lane -// vset, vsetq lane - -// vcreate -// vdup, vdupq -// vmov, vmovq -// vdup_lane, vdupq_lane -// vcombine -// vget_high, vget_low - -// vcvt {u,s} <-> f, f <-> f16 -// narrow -// long move (unpack) -// saturating narrow -// saturating narrow s->u - -// table lookup -// extended table lookup - -// mla with scalar -// widening mla with scalar -// widening saturating doubling mla with scalar -// mls with scalar -// widening mls with scalar -// widening saturating doubling mls with scalar -// mul by scalar -// long mul with scalar -// long mul by scalar -// saturating doubling long mul with scalar -// saturating doubling long mul by scalar -// saturating doubling mul high with scalar -// saturating doubling mul high by scalar -// saturating rounding doubling mul high with scalar -// saturating rounding doubling mul high by scalar -// mla with scalar -// widening mla with sclar -// widening saturating doubling mla with scalar -// mls with scalar -// widening mls with scalar -// widening saturating doubling mls with scalar - -// extract - -// endian swap (vrev) - -// negate - -// abs -// saturating abs -// saturating negate -// count leading signs -INT_UNARY(vabs, __builtin_neon_vabs) -FP_UNARY(vabs, __builtin_neon_vabs) -INT_UNARY(vqabs, __builtin_neon_vqabs) -INT_UNARY(vqneg, __builtin_neon_vqneg) -INT_UNARY(vcls, __builtin_neon_vcls) - -// count leading zeroes -// popcount - -// recip_est -// recip_sqrt_est - -// not-poly -// not - -// and -// or -// xor -// andn -// orn -INT_LOGICAL_OP(vand, &) -INT_LOGICAL_OP(vorr, |) -INT_LOGICAL_OP(veor, ^) -INT_LOGICAL_OP(vbic, &~) -INT_LOGICAL_OP(vorn, |~) - -// bitselect - -// transpose elts -// interleave elts -// deinterleave elts - -// vreinterpret - -#endif /* __ARM_NEON_H */ |