11 "Never use <avx10_2_512bf16intrin.h> directly; include <immintrin.h> instead."
16#ifndef __AVX10_2_512BF16INTRIN_H
17#define __AVX10_2_512BF16INTRIN_H
20typedef __bf16 __m512bh_u
__attribute__((__vector_size__(64), __aligned__(1)));
23#define __DEFAULT_FN_ATTRS512 \
24 __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
25 __min_vector_width__(512)))
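/* Usage sketch (illustrative only, not part of this header's API): user code
 * includes <immintrin.h> and builds with the "avx10.2-512" target feature
 * enabled (see the __target__ attribute above), then calls the intrinsics
 * defined below, e.g.:
 *
 *   __m512bh x = _mm512_set1_pbh((__bf16)1.5f);
 *   __m512bh y = _mm512_addne_pbh(x, x);   // element-wise bf16 add
 */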
static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
  return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) {
  return (__m512bh)__builtin_ia32_undef512();
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) {
  return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf};
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh(
    __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6,
    __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12,
    __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17,
    __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22,
    __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27,
    __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) {
  return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25,
                             bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17,
                             bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9,
                             bf8,  bf7,  bf6,  bf5,  bf4,  bf3,  bf2,  bf1};
}
#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10,     \
                        bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19,  \
                        bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28,  \
                        bf29, bf30, bf31, bf32)                                \
  _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26),       \
                 (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19),       \
                 (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12),       \
                 (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4),     \
                 (bf3), (bf2), (bf1))
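/* The following casts reinterpret the bits of a 512-bit bf16 vector as other
 * 512-bit vector types (and back); no value conversion is performed. */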
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_castpbf16_ps(__m512bh __a) {
  return (__m512)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_castpbf16_pd(__m512bh __a) {
  return (__m512d)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castpbf16_si512(__m512bh __a) {
  return (__m512i)__a;
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castpd_pbh(__m512d __a) {
  return (__m512bh)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castsi512_pbh(__m512i __a) {
  return (__m512bh)__a;
}
static __inline__ __m128bh __DEFAULT_FN_ATTRS512
_mm512_castpbf16512_pbh128(__m512bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_castpbf16512_pbh256(__m512bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castpbf16128_pbh512(__m128bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castpbf16256_pbh512(__m256bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_zextpbf16128_pbh512(__m128bh __a) {
  return __builtin_shufflevector(
      __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_zextpbf16256_pbh512(__m256bh __a) {
  return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}
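/* Unlike the cast*128/256_pbh512 widening casts above, which leave the upper
 * elements undefined, the zext forms zero-extend by taking the upper elements
 * from a zero vector. */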
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_load_pbh(void const *__p) {
  return *(const __m512bh *)__p;
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_loadu_pbh(void const *__p) {
  struct __loadu_pbh {
    __m512bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pbh *)__p)->__v;
}
static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P,
                                                              __m512bh __A) {
  *(__m512bh *)__P = __A;
}
static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P,
                                                               __m512bh __A) {
  struct __storeu_pbh {
    __m512bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pbh *)__P)->__v = __A;
}
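/* _mm512_load_pbh/_mm512_store_pbh assume 64-byte alignment; the loadu/storeu
 * forms go through a packed, may_alias wrapper around the 1-byte-aligned
 * __m512bh_u type so the compiler emits an unaligned access. */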
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W,
                                                (__v32bf)__A);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                  (__v32hi)__B);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}
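/* Element-wise arithmetic. For each operation, the unmasked form computes all
 * 32 bf16 lanes, the mask_* form keeps lanes from __W where the corresponding
 * bit of __U is clear, and the maskz_* form zeroes those lanes instead. */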
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_addne_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)((__v32bf)__A + (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_addne_pbh(
    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_addne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_subne_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)((__v32bf)__A - (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_subne_pbh(
    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_subne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mulne_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)((__v32bf)__A * (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_mulne_pbh(
    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_mulne_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_mulne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_mulne_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_divne_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)((__v32bf)__A / (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_divne_pbh(
    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_divne_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_divne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_divne_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_max_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vmaxpbf16512((__v32bf)__A, (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_max_pbh(
    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_min_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vminpbf16512((__v32bf)__A, (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_min_pbh(
    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}
#define _mm512_cmp_pbh_mask(__A, __B, __P)                                     \
  ((__mmask32)__builtin_ia32_vcmppbf16512_mask((__v32bf)(__m512bh)(__A),       \
                                               (__v32bf)(__m512bh)(__B),       \
                                               (int)(__P), (__mmask32)-1))

#define _mm512_mask_cmp_pbh_mask(__U, __A, __B, __P)                           \
  ((__mmask32)__builtin_ia32_vcmppbf16512_mask((__v32bf)(__m512bh)(__A),       \
                                               (__v32bf)(__m512bh)(__B),       \
                                               (int)(__P), (__mmask32)(__U)))
#define _mm512_mask_fpclass_pbh_mask(__U, __A, imm)                            \
  ((__mmask32)__builtin_ia32_vfpclasspbf16512_mask(                            \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)(__U)))

#define _mm512_fpclass_pbh_mask(__A, imm)                                      \
  ((__mmask32)__builtin_ia32_vfpclasspbf16512_mask(                            \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)-1))
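/* The compare predicate __P and the fpclass category imm must be integer
 * constant expressions, which is why these operations are provided as macros
 * rather than inline functions. */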
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_scalef_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefpbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_undefined_pbh(),
      (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_pbh(
    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefpbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)__W, (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefpbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_setzero_pbh(),
      (__mmask32)__U);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_rcp_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcppbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcppbf16512_mask((__v32bf)__A, (__v32bf)__W,
                                                    (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcppbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_getexp_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexppbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexppbf16512_mask(
      (__v32bf)__A, (__v32bf)__W, (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexppbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_rsqrt_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtpbf16512_mask(
      (__v32bf)__A, (__v32bf)__W, (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}
#define _mm512_reducene_pbh(__A, imm)                                          \
  ((__m512bh)__builtin_ia32_vreducenepbf16512_mask(                            \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_undefined_pbh(),   \
      (__mmask32)-1))

#define _mm512_mask_reducene_pbh(__W, __U, __A, imm)                           \
  ((__m512bh)__builtin_ia32_vreducenepbf16512_mask(                            \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W),          \
      (__mmask32)(__U)))

#define _mm512_maskz_reducene_pbh(__U, __A, imm)                               \
  ((__m512bh)__builtin_ia32_vreducenepbf16512_mask(                            \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(),     \
      (__mmask32)(__U)))

#define _mm512_roundscalene_pbh(__A, imm)                                      \
  ((__m512bh)__builtin_ia32_vrndscalenepbf16_mask(                             \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(),     \
      (__mmask32)-1))

#define _mm512_mask_roundscalene_pbh(__W, __U, __A, imm)                       \
  ((__m512bh)__builtin_ia32_vrndscalenepbf16_mask(                             \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W),          \
      (__mmask32)(__U)))

#define _mm512_maskz_roundscalene_pbh(__U, __A, imm)                           \
  ((__m512bh)__builtin_ia32_vrndscalenepbf16_mask(                             \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(),     \
      (__mmask32)(__U)))
#define _mm512_getmant_pbh(__A, __B, __C)                                      \
  ((__m512bh)__builtin_ia32_vgetmantpbf16512_mask(                             \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)),                   \
      (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1))

#define _mm512_mask_getmant_pbh(__W, __U, __A, __B, __C)                       \
  ((__m512bh)__builtin_ia32_vgetmantpbf16512_mask(                             \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)),                   \
      (__v32bf)(__m512bh)(__W), (__mmask32)(__U)))

#define _mm512_maskz_getmant_pbh(__U, __A, __B, __C)                           \
  ((__m512bh)__builtin_ia32_vgetmantpbf16512_mask(                             \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)),                   \
      (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
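/* For getmant, the interval selector __B (low 2 bits) and the sign control __C
 * are packed into a single immediate as ((__C) << 2) | (__B) before being
 * passed to the builtin. */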
static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vsqrtnepbf16512((__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sqrt_pbh(__A), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
                                                (__v32bf)_mm512_sqrt_pbh(__A),
                                                (__v32bf)_mm512_setzero_pbh());
}
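/* Fused multiply-add family. All four variants lower to the same FMA builtin
 * with the sign of the second and/or third operand flipped: fmaddne computes
 * A*B+C, fmsubne A*B-C, fnmaddne -(A*B)+C, and fnmsubne -(A*B)-C. The mask_*
 * forms keep __A in unselected lanes, the mask3_* forms keep __C, and the
 * maskz_* forms zero them. */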
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B,
                                                 (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fmaddne_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmaddne_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmaddne_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B,
                                                 -(__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fmsubne_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsubne_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsubne_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fnmaddne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B,
                                                 (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmaddne_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmaddne_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmaddne_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmaddne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fnmsubne_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B,
                                                 -(__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsubne_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmsubne_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsubne_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsubne_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}
#undef __DEFAULT_FN_ATTRS512

#endif // __AVX10_2_512BF16INTRIN_H