10#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
15#ifndef __AVX512VLBF16INTRIN_H
16#define __AVX512VLBF16INTRIN_H
/* Function-attribute bundle with a 128-bit minimum vector width.
 *
 * Expands to a single __attribute__ list containing:
 *   - __always_inline__ and __nodebug__,
 *   - __target__("avx512vl,avx512bf16,no-evex512"),
 *   - __min_vector_width__(128).
 *
 * The directive must begin the line with '#'; any text in front of it stops
 * the preprocessor from recognizing it as a #define. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512bf16,no-evex512"), \
                 __min_vector_width__(128)))
/* Function-attribute bundle with a 256-bit minimum vector width.
 *
 * Expands to a single __attribute__ list containing:
 *   - __always_inline__ and __nodebug__,
 *   - __target__("avx512vl,avx512bf16,no-evex512"),
 *   - __min_vector_width__(256).
 *
 * The directive must begin the line with '#'; any text in front of it stops
 * the preprocessor from recognizing it as a #define. */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512bf16,no-evex512"), \
                 __min_vector_width__(256)))
40_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
41 return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
63_mm_mask_cvtne2ps_pbh(__m128bh __W,
__mmask8 __U, __m128 __A, __m128 __B) {
64 return (__m128bh)__builtin_ia32_selectpbf_128((
__mmask8)__U,
65 (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
85_mm_maskz_cvtne2ps_pbh(
__mmask8 __U, __m128 __A, __m128 __B) {
86 return (__m128bh)__builtin_ia32_selectpbf_128((
__mmask8)__U,
87 (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
104_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
105 return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
127_mm256_mask_cvtne2ps_pbh(__m256bh __W,
__mmask16 __U, __m256 __A, __m256 __B) {
128 return (__m256bh)__builtin_ia32_selectpbf_256((
__mmask16)__U,
129 (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
149_mm256_maskz_cvtne2ps_pbh(
__mmask16 __U, __m256 __A, __m256 __B) {
150 return (__m256bh)__builtin_ia32_selectpbf_256((
__mmask16)__U,
151 (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
/* _mm_cvtneps_pbh(A)
 *
 * Expands to a call of __builtin_ia32_vcvtneps2bf16128 on A cast to __v4sf,
 * with the builtin's result cast to __m128bh. The argument is parenthesized
 * in the expansion, so A may be an arbitrary expression.
 *
 * NOTE(review): by its name this is the unmasked packed-float to packed-BF16
 * conversion; confirm the lane layout of the result against the Intel
 * intrinsics documentation. */
#define _mm_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
184_mm_mask_cvtneps_pbh(__m128bh __W,
__mmask8 __U, __m128 __A) {
185 return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
204_mm_maskz_cvtneps_pbh(
__mmask8 __U, __m128 __A) {
205 return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
/* _mm256_cvtneps_pbh(A)
 *
 * Expands to a call of __builtin_ia32_vcvtneps2bf16256 on A cast to __v8sf,
 * with the builtin's result cast to __m128bh (a 128-bit result type, not a
 * 256-bit one). The argument is parenthesized in the expansion, so A may be
 * an arbitrary expression.
 *
 * NOTE(review): by its name this is the unmasked packed-float to packed-BF16
 * conversion for a 256-bit source; confirm against the Intel intrinsics
 * documentation. */
#define _mm256_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
237_mm256_mask_cvtneps_pbh(__m128bh __W,
__mmask8 __U, __m256 __A) {
238 return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
256_mm256_maskz_cvtneps_pbh(
__mmask8 __U, __m256 __A) {
257 return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
277_mm_dpbf16_ps(__m128
__D, __m128bh __A, __m128bh __B) {
278 return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)
__D,
301_mm_mask_dpbf16_ps(__m128
__D,
__mmask8 __U, __m128bh __A, __m128bh __B) {
302 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
303 (__v4sf)_mm_dpbf16_ps(
__D, __A, __B),
325_mm_maskz_dpbf16_ps(
__mmask8 __U, __m128
__D, __m128bh __A, __m128bh __B) {
326 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
327 (__v4sf)_mm_dpbf16_ps(
__D, __A, __B),
346_mm256_dpbf16_ps(__m256
__D, __m256bh __A, __m256bh __B) {
347 return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)
__D,
370_mm256_mask_dpbf16_ps(__m256
__D,
__mmask8 __U, __m256bh __A, __m256bh __B) {
371 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
372 (__v8sf)_mm256_dpbf16_ps(
__D, __A, __B),
394_mm256_maskz_dpbf16_ps(
__mmask8 __U, __m256
__D, __m256bh __A, __m256bh __B) {
395 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
396 (__v8sf)_mm256_dpbf16_ps(
__D, __A, __B),
411 __v4sf __V = {__A, 0, 0, 0};
412 __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
414 return (__bf16)__R[0];
452_mm_maskz_cvtpbh_ps(
__mmask8 __U, __m128bh __A) {
468_mm256_maskz_cvtpbh_ps(
__mmask8 __U, __m128bh __A) {
487_mm_mask_cvtpbh_ps(__m128 __S,
__mmask8 __U, __m128bh __A) {
507_mm256_mask_cvtpbh_ps(__m256 __S,
__mmask8 __U, __m128bh __A) {
513#undef __DEFAULT_FN_ATTRS128
514#undef __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi32(__m128i __V)
Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit value...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi32(__m256i __a, int __count)
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits,...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A)
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
static __inline__ void short __D
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V)
Sign-extends each of the lower four 16-bit integer elements of a 128-bit integer vector of [8 x i16] ...