diff options
Diffstat (limited to 'clang/lib/Headers/amxintrin.h')
-rw-r--r-- | clang/lib/Headers/amxintrin.h | 32 |
1 files changed, 32 insertions, 0 deletions
diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
index ec67a87e39ca..baa56f5b28e8 100644
--- a/clang/lib/Headers/amxintrin.h
+++ b/clang/lib/Headers/amxintrin.h
@@ -22,6 +22,8 @@
   __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
 #define __DEFAULT_FN_ATTRS_BF16                                                \
   __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
+#define __DEFAULT_FN_ATTRS_FP16                                                \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
 
 /// Load tile configuration from a 64-byte memory location specified by
 /// "mem_addr". The tile configuration includes the tile type palette, the
@@ -290,6 +292,13 @@ _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
 }
 
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
+_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
 /// This struct pack the shape and tile data together for user. We suggest
 /// initializing the struct as early as possible, because compiler depends
 /// on the shape information to do configure. The constant value is preferred
@@ -484,9 +493,32 @@ static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
                                       src0.tile, src1.tile);
 }
 
+/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles "src0"
+/// and "src1", accumulating the intermediate single-precision (32-bit)
+/// floating-point elements with elements in "dst", and store the 32-bit
+/// result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_FP16
+static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
+                                       __tile1024i src1) {
+  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+                                      src0.tile, src1.tile);
+}
+
 #undef __DEFAULT_FN_ATTRS_TILE
 #undef __DEFAULT_FN_ATTRS_INT8
 #undef __DEFAULT_FN_ATTRS_BF16
+#undef __DEFAULT_FN_ATTRS_FP16
 
 #endif /* __x86_64__ */
 #endif /* __AMXINTRIN_H */
|