10#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
15#ifndef __AVX512FP16INTRIN_H
16#define __AVX512FP16INTRIN_H
/* Default function attributes for the 512-bit variants: always inlined,
 * marked __nodebug__, built for the "avx512fp16,evex512" target features,
 * with a minimum vector width of 512. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("avx512fp16,evex512"), \
                 __min_vector_width__(512)))
/* Default function attributes for the 256-bit variants: always inlined,
 * marked __nodebug__, built for the "avx512fp16,no-evex512" target features,
 * with a minimum vector width of 256. */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("avx512fp16,no-evex512"), \
                 __min_vector_width__(256)))
/* Default function attributes for the 128-bit variants: always inlined,
 * marked __nodebug__, built for the "avx512fp16,no-evex512" target features,
 * with a minimum vector width of 128. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("avx512fp16,no-evex512"), \
                 __min_vector_width__(128)))
41 return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
45 return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
46 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
50 return (__m256h)__builtin_ia32_undef256();
54 return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
55 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
56 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
60 return (__m128h)__builtin_ia32_undef128();
64 return (__m512h)__builtin_ia32_undef512();
68 return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
69 __h, __h, __h, __h, __h, __h, __h, __h,
70 __h, __h, __h, __h, __h, __h, __h, __h,
71 __h, __h, __h, __h, __h, __h, __h, __h};
83 return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
84 __h25, __h24, __h23, __h22, __h21, __h20, __h19,
85 __h18, __h17, __h16, __h15, __h14, __h13, __h12,
86 __h11, __h10, __h9, __h8, __h7, __h6, __h5,
87 __h4, __h3, __h2, __h1};
/* Reversed-order companion of _mm512_set_ph: forwards its 32 arguments to
 * _mm512_set_ph with the argument list flipped (e32 first, e1 last). */
#define _mm512_setr_ph(e1, e2, e3, e4, e5, e6, e7, e8, \
                       e9, e10, e11, e12, e13, e14, e15, e16, \
                       e17, e18, e19, e20, e21, e22, e23, e24, \
                       e25, e26, e27, e28, e29, e30, e31, e32) \
  _mm512_set_ph((e32), (e31), (e30), (e29), (e28), (e27), (e26), (e25), \
                (e24), (e23), (e22), (e21), (e20), (e19), (e18), (e17), \
                (e16), (e15), (e14), (e13), (e12), (e11), (e10), (e9), \
                (e8), (e7), (e6), (e5), (e4), (e3), (e2), (e1))
99_mm512_set1_pch(
_Float16 _Complex __h) {
132_mm256_castph_si256(__m256h
__a) {
137_mm512_castph_si512(__m512h
__a) {
170_mm256_castsi256_ph(__m256i
__a) {
175_mm512_castsi512_ph(__m512i
__a) {
180_mm256_castph256_ph128(__m256h
__a) {
181 return __builtin_shufflevector(
__a,
__a, 0, 1, 2, 3, 4, 5, 6, 7);
185_mm512_castph512_ph128(__m512h
__a) {
186 return __builtin_shufflevector(
__a,
__a, 0, 1, 2, 3, 4, 5, 6, 7);
190_mm512_castph512_ph256(__m512h
__a) {
191 return __builtin_shufflevector(
__a,
__a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
196_mm256_castph128_ph256(__m128h
__a) {
197 return __builtin_shufflevector(
__a, __builtin_nondeterministic_value(
__a),
198 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
202_mm512_castph128_ph512(__m128h
__a) {
203 __m256h
__b = __builtin_nondeterministic_value(
__b);
204 return __builtin_shufflevector(
205 __builtin_shufflevector(
__a, __builtin_nondeterministic_value(
__a),
206 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
207 __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
208 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
212_mm512_castph256_ph512(__m256h
__a) {
213 return __builtin_shufflevector(
__a, __builtin_nondeterministic_value(
__a), 0,
214 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
215 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
233_mm256_zextph128_ph256(__m128h
__a) {
234 return __builtin_shufflevector(
__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
235 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
252_mm512_zextph128_ph512(__m128h
__a) {
253 return __builtin_shufflevector(
254 __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
255 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
272_mm512_zextph256_ph512(__m256h
__a) {
273 return __builtin_shufflevector(
__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
274 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
275 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
/* Compare A and B with __builtin_ia32_vcomish.
 *
 *   A, B  operands, viewed as __v8hf
 *   P     comparison predicate, passed as int
 *   R     trailing int argument of the builtin (the "round" control)
 *
 * Each operand is parenthesised before it is cast so that an expression
 * argument (for example `x + y` or `c ? x : y`) is cast as a whole rather
 * than only its first operand, and the whole expansion is parenthesised so
 * the macro can be used safely inside a larger expression. */
#define _mm_comi_round_sh(A, B, P, R) \
  (__builtin_ia32_vcomish((__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
                          (int)(P), (int)(R)))
/* Same comparison as _mm_comi_round_sh, with _MM_FROUND_CUR_DIRECTION
 * supplied as the final argument. */
#define _mm_comi_sh(a, b, pred) \
  _mm_comi_round_sh( \
      (a), \
      (b), \
      (pred), \
      _MM_FROUND_CUR_DIRECTION)
287 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_EQ_OS,
293 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_LT_OS,
299 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_LE_OS,
305 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_GT_OS,
311 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_GE_OS,
317 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_NEQ_US,
323 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_EQ_OQ,
329 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_LT_OQ,
335 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_LE_OQ,
341 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_GT_OQ,
347 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_GE_OQ,
353 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_NEQ_UQ,
359 return (__m512h)((__v32hf)__A + (__v32hf)__B);
363_mm512_mask_add_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
364 return (__m512h)__builtin_ia32_selectph_512(
365 (
__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
369_mm512_maskz_add_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
370 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
371 (__v32hf)_mm512_add_ph(__A, __B),
372 (__v32hf)_mm512_setzero_ph());
/* Calls __builtin_ia32_addph512 on a and b (each viewed as __v32hf) with rnd
 * as the trailing int argument; the result is cast to __m512h. */
#define _mm512_add_round_ph(a, b, rnd) \
  ((__m512h)__builtin_ia32_addph512( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(b), \
      (int)(rnd)))
/* Passes mask k, the result of _mm512_add_round_ph(a, b, rnd) and w, in that
 * order, to __builtin_ia32_selectph_512. */
#define _mm512_mask_add_round_ph(w, k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_add_round_ph((a), (b), (rnd)), \
      (__v32hf)(__m512h)(w)))
/* Passes mask k, the result of _mm512_add_round_ph(a, b, rnd) and
 * _mm512_setzero_ph(), in that order, to __builtin_ia32_selectph_512. */
#define _mm512_maskz_add_round_ph(k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_add_round_ph((a), (b), (rnd)), \
      (__v32hf)_mm512_setzero_ph()))
391 return (__m512h)((__v32hf)__A - (__v32hf)__B);
395_mm512_mask_sub_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
396 return (__m512h)__builtin_ia32_selectph_512(
397 (
__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
401_mm512_maskz_sub_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
402 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
403 (__v32hf)_mm512_sub_ph(__A, __B),
404 (__v32hf)_mm512_setzero_ph());
/* Calls __builtin_ia32_subph512 on a and b (each viewed as __v32hf) with rnd
 * as the trailing int argument; the result is cast to __m512h. */
#define _mm512_sub_round_ph(a, b, rnd) \
  ((__m512h)__builtin_ia32_subph512( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(b), \
      (int)(rnd)))
/* Passes mask k, the result of _mm512_sub_round_ph(a, b, rnd) and w, in that
 * order, to __builtin_ia32_selectph_512. */
#define _mm512_mask_sub_round_ph(w, k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_sub_round_ph((a), (b), (rnd)), \
      (__v32hf)(__m512h)(w)))
/* Passes mask k, the result of _mm512_sub_round_ph(a, b, rnd) and
 * _mm512_setzero_ph(), in that order, to __builtin_ia32_selectph_512. */
#define _mm512_maskz_sub_round_ph(k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_sub_round_ph((a), (b), (rnd)), \
      (__v32hf)_mm512_setzero_ph()))
423 return (__m512h)((__v32hf)__A * (__v32hf)__B);
427_mm512_mask_mul_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
428 return (__m512h)__builtin_ia32_selectph_512(
429 (
__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
433_mm512_maskz_mul_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
434 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
435 (__v32hf)_mm512_mul_ph(__A, __B),
436 (__v32hf)_mm512_setzero_ph());
/* Calls __builtin_ia32_mulph512 on a and b (each viewed as __v32hf) with rnd
 * as the trailing int argument; the result is cast to __m512h. */
#define _mm512_mul_round_ph(a, b, rnd) \
  ((__m512h)__builtin_ia32_mulph512( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(b), \
      (int)(rnd)))
/* Passes mask k, the result of _mm512_mul_round_ph(a, b, rnd) and w, in that
 * order, to __builtin_ia32_selectph_512. */
#define _mm512_mask_mul_round_ph(w, k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_mul_round_ph((a), (b), (rnd)), \
      (__v32hf)(__m512h)(w)))
/* Passes mask k, the result of _mm512_mul_round_ph(a, b, rnd) and
 * _mm512_setzero_ph(), in that order, to __builtin_ia32_selectph_512. */
#define _mm512_maskz_mul_round_ph(k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_mul_round_ph((a), (b), (rnd)), \
      (__v32hf)_mm512_setzero_ph()))
455 return (__m512h)((__v32hf)__A / (__v32hf)__B);
459_mm512_mask_div_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
460 return (__m512h)__builtin_ia32_selectph_512(
461 (
__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
465_mm512_maskz_div_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
466 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
467 (__v32hf)_mm512_div_ph(__A, __B),
468 (__v32hf)_mm512_setzero_ph());
/* Calls __builtin_ia32_divph512 on a and b (each viewed as __v32hf) with rnd
 * as the trailing int argument; the result is cast to __m512h. */
#define _mm512_div_round_ph(a, b, rnd) \
  ((__m512h)__builtin_ia32_divph512( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(b), \
      (int)(rnd)))
/* Passes mask k, the result of _mm512_div_round_ph(a, b, rnd) and w, in that
 * order, to __builtin_ia32_selectph_512. */
#define _mm512_mask_div_round_ph(w, k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_div_round_ph((a), (b), (rnd)), \
      (__v32hf)(__m512h)(w)))
/* Passes mask k, the result of _mm512_div_round_ph(a, b, rnd) and
 * _mm512_setzero_ph(), in that order, to __builtin_ia32_selectph_512. */
#define _mm512_maskz_div_round_ph(k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_div_round_ph((a), (b), (rnd)), \
      (__v32hf)_mm512_setzero_ph()))
487 return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
492_mm512_mask_min_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
493 return (__m512h)__builtin_ia32_selectph_512(
494 (
__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
498_mm512_maskz_min_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
499 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
500 (__v32hf)_mm512_min_ph(__A, __B),
501 (__v32hf)_mm512_setzero_ph());
/* Calls __builtin_ia32_minph512 on a and b (each viewed as __v32hf) with rnd
 * as the trailing int argument; the result is cast to __m512h. */
#define _mm512_min_round_ph(a, b, rnd) \
  ((__m512h)__builtin_ia32_minph512( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(b), \
      (int)(rnd)))
/* Passes mask k, the result of _mm512_min_round_ph(a, b, rnd) and w, in that
 * order, to __builtin_ia32_selectph_512. */
#define _mm512_mask_min_round_ph(w, k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_min_round_ph((a), (b), (rnd)), \
      (__v32hf)(__m512h)(w)))
/* Passes mask k, the result of _mm512_min_round_ph(a, b, rnd) and
 * _mm512_setzero_ph(), in that order, to __builtin_ia32_selectph_512. */
#define _mm512_maskz_min_round_ph(k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_min_round_ph((a), (b), (rnd)), \
      (__v32hf)_mm512_setzero_ph()))
520 return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
525_mm512_mask_max_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
526 return (__m512h)__builtin_ia32_selectph_512(
527 (
__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
531_mm512_maskz_max_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
532 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
533 (__v32hf)_mm512_max_ph(__A, __B),
534 (__v32hf)_mm512_setzero_ph());
/* Calls __builtin_ia32_maxph512 on a and b (each viewed as __v32hf) with rnd
 * as the trailing int argument; the result is cast to __m512h. */
#define _mm512_max_round_ph(a, b, rnd) \
  ((__m512h)__builtin_ia32_maxph512( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(b), \
      (int)(rnd)))
/* Passes mask k, the result of _mm512_max_round_ph(a, b, rnd) and w, in that
 * order, to __builtin_ia32_selectph_512. */
#define _mm512_mask_max_round_ph(w, k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_max_round_ph((a), (b), (rnd)), \
      (__v32hf)(__m512h)(w)))
/* Passes mask k, the result of _mm512_max_round_ph(a, b, rnd) and
 * _mm512_setzero_ph(), in that order, to __builtin_ia32_selectph_512. */
#define _mm512_maskz_max_round_ph(k, a, b, rnd) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(k), \
      (__v32hf)_mm512_max_round_ph((a), (b), (rnd)), \
      (__v32hf)_mm512_setzero_ph()))
560_mm512_mask_conj_pch(__m512h __W,
__mmask16 __U, __m512h __A) {
561 return (__m512h)__builtin_ia32_selectps_512(
562 (
__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
566_mm512_maskz_conj_pch(
__mmask16 __U, __m512h __A) {
567 return (__m512h)__builtin_ia32_selectps_512((
__mmask16)__U,
568 (__v16sf)_mm512_conj_pch(__A),
582 __A = _mm_add_sh(__A, __B);
583 return __builtin_ia32_selectsh_128(__U, __A, __W);
589 __A = _mm_add_sh(__A, __B);
590 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
/* Calls __builtin_ia32_addsh_round_mask on a and b (each viewed as __v8hf)
 * with _mm_setzero_ph() as third operand, mask (__mmask8)-1 and rnd as the
 * trailing int argument. */
#define _mm_add_round_sh(a, b, rnd) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_addsh_round_mask on a and b with w as third operand,
 * k as the __mmask8 mask and rnd as the trailing int argument. */
#define _mm_mask_add_round_sh(w, k, a, b, rnd) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_addsh_round_mask on a and b with _mm_setzero_ph() as
 * third operand, k as the __mmask8 mask and rnd as the trailing int
 * argument. */
#define _mm_maskz_add_round_sh(k, a, b, rnd) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      (int)(rnd)))
618 __A = _mm_sub_sh(__A, __B);
619 return __builtin_ia32_selectsh_128(__U, __A, __W);
625 __A = _mm_sub_sh(__A, __B);
626 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
/* Calls __builtin_ia32_subsh_round_mask on a and b (each viewed as __v8hf)
 * with _mm_setzero_ph() as third operand, mask (__mmask8)-1 and rnd as the
 * trailing int argument. */
#define _mm_sub_round_sh(a, b, rnd) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_subsh_round_mask on a and b with w as third operand,
 * k as the __mmask8 mask and rnd as the trailing int argument. */
#define _mm_mask_sub_round_sh(w, k, a, b, rnd) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_subsh_round_mask on a and b with _mm_setzero_ph() as
 * third operand, k as the __mmask8 mask and rnd as the trailing int
 * argument. */
#define _mm_maskz_sub_round_sh(k, a, b, rnd) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      (int)(rnd)))
654 __A = _mm_mul_sh(__A, __B);
655 return __builtin_ia32_selectsh_128(__U, __A, __W);
661 __A = _mm_mul_sh(__A, __B);
662 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
/* Calls __builtin_ia32_mulsh_round_mask on a and b (each viewed as __v8hf)
 * with _mm_setzero_ph() as third operand, mask (__mmask8)-1 and rnd as the
 * trailing int argument. */
#define _mm_mul_round_sh(a, b, rnd) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_mulsh_round_mask on a and b with w as third operand,
 * k as the __mmask8 mask and rnd as the trailing int argument. */
#define _mm_mask_mul_round_sh(w, k, a, b, rnd) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_mulsh_round_mask on a and b with _mm_setzero_ph() as
 * third operand, k as the __mmask8 mask and rnd as the trailing int
 * argument. */
#define _mm_maskz_mul_round_sh(k, a, b, rnd) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      (int)(rnd)))
690 __A = _mm_div_sh(__A, __B);
691 return __builtin_ia32_selectsh_128(__U, __A, __W);
697 __A = _mm_div_sh(__A, __B);
698 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
/* Calls __builtin_ia32_divsh_round_mask on a and b (each viewed as __v8hf)
 * with _mm_setzero_ph() as third operand, mask (__mmask8)-1 and rnd as the
 * trailing int argument. */
#define _mm_div_round_sh(a, b, rnd) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_divsh_round_mask on a and b with w as third operand,
 * k as the __mmask8 mask and rnd as the trailing int argument. */
#define _mm_mask_div_round_sh(w, k, a, b, rnd) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_divsh_round_mask on a and b with _mm_setzero_ph() as
 * third operand, k as the __mmask8 mask and rnd as the trailing int
 * argument. */
#define _mm_maskz_div_round_sh(k, a, b, rnd) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      (int)(rnd)))
718 return (__m128h)__builtin_ia32_minsh_round_mask(
719 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
727 return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
735 return (__m128h)__builtin_ia32_minsh_round_mask(
736 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
/* Calls __builtin_ia32_minsh_round_mask on a and b (each viewed as __v8hf)
 * with _mm_setzero_ph() as third operand, mask (__mmask8)-1 and rnd as the
 * trailing int argument. */
#define _mm_min_round_sh(a, b, rnd) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_minsh_round_mask on a and b with w as third operand,
 * k as the __mmask8 mask and rnd as the trailing int argument. */
#define _mm_mask_min_round_sh(w, k, a, b, rnd) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_minsh_round_mask on a and b with _mm_setzero_ph() as
 * third operand, k as the __mmask8 mask and rnd as the trailing int
 * argument. */
#define _mm_maskz_min_round_sh(k, a, b, rnd) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      (int)(rnd)))
757 return (__m128h)__builtin_ia32_maxsh_round_mask(
758 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
766 return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
774 return (__m128h)__builtin_ia32_maxsh_round_mask(
775 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
/* Calls __builtin_ia32_maxsh_round_mask on a and b (each viewed as __v8hf)
 * with _mm_setzero_ph() as third operand, mask (__mmask8)-1 and rnd as the
 * trailing int argument. */
#define _mm_max_round_sh(a, b, rnd) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_maxsh_round_mask on a and b with w as third operand,
 * k as the __mmask8 mask and rnd as the trailing int argument. */
#define _mm_mask_max_round_sh(w, k, a, b, rnd) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_maxsh_round_mask on a and b with _mm_setzero_ph() as
 * third operand, k as the __mmask8 mask and rnd as the trailing int
 * argument. */
#define _mm_maskz_max_round_sh(k, a, b, rnd) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_cmpph512_mask on a and b (each viewed as __v32hf)
 * with predicate pred, mask (__mmask32)-1 and rnd as the trailing int
 * argument; the result is cast to __mmask32. */
#define _mm512_cmp_round_ph_mask(a, b, pred, rnd) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(b), \
      (int)(pred), \
      (__mmask32)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_cmpph512_mask on a and b (each viewed as __v32hf)
 * with predicate pred, k as the __mmask32 mask and rnd as the trailing int
 * argument; the result is cast to __mmask32. */
#define _mm512_mask_cmp_round_ph_mask(k, a, b, pred, rnd) \
  ((__mmask32)__builtin_ia32_cmpph512_mask( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(b), \
      (int)(pred), \
      (__mmask32)(k), \
      (int)(rnd)))
/* Forwards to _mm512_cmp_round_ph_mask with _MM_FROUND_CUR_DIRECTION as the
 * final argument. */
#define _mm512_cmp_ph_mask(a, b, pred) \
  _mm512_cmp_round_ph_mask( \
      (a), \
      (b), \
      (pred), \
      _MM_FROUND_CUR_DIRECTION)
/* Forwards to _mm512_mask_cmp_round_ph_mask with _MM_FROUND_CUR_DIRECTION as
 * the final argument. */
#define _mm512_mask_cmp_ph_mask(k, a, b, pred) \
  _mm512_mask_cmp_round_ph_mask( \
      (k), \
      (a), \
      (b), \
      (pred), \
      _MM_FROUND_CUR_DIRECTION)
/* Calls __builtin_ia32_cmpsh_mask on x and y (each viewed as __v8hf) with
 * predicate pred, mask (__mmask8)-1 and rnd as the trailing int argument;
 * the result is cast to __mmask8. */
#define _mm_cmp_round_sh_mask(x, y, pred, rnd) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(x), \
      (__v8hf)(__m128h)(y), \
      (int)(pred), \
      (__mmask8)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_cmpsh_mask on x and y (each viewed as __v8hf) with
 * predicate pred, k as the __mmask8 mask and rnd as the trailing int
 * argument; the result is cast to __mmask8. */
#define _mm_mask_cmp_round_sh_mask(k, x, y, pred, rnd) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(x), \
      (__v8hf)(__m128h)(y), \
      (int)(pred), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_cmpsh_mask on x and y (each viewed as __v8hf) with
 * predicate pred, mask (__mmask8)-1 and _MM_FROUND_CUR_DIRECTION as the
 * trailing argument; the result is cast to __mmask8. */
#define _mm_cmp_sh_mask(x, y, pred) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(x), \
      (__v8hf)(__m128h)(y), \
      (int)(pred), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_cmpsh_mask on x and y (each viewed as __v8hf) with
 * predicate pred, k as the __mmask8 mask and _MM_FROUND_CUR_DIRECTION as the
 * trailing argument; the result is cast to __mmask8. */
#define _mm_mask_cmp_sh_mask(k, x, y, pred) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(x), \
      (__v8hf)(__m128h)(y), \
      (int)(pred), \
      (__mmask8)(k), \
      _MM_FROUND_CUR_DIRECTION))
831 struct __mm_load_sh_struct {
834 _Float16 __u = ((
const struct __mm_load_sh_struct *)__dp)->__u;
835 return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
839_mm_mask_load_sh(__m128h __W,
__mmask8 __U,
const void *__A) {
840 __m128h src = (__v8hf)__builtin_shufflevector(
841 (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
843 return (__m128h)__builtin_ia32_loadsh128_mask((
const __v8hf *)__A, src, __U & 1);
847_mm_maskz_load_sh(
__mmask8 __U,
const void *__A) {
848 return (__m128h)__builtin_ia32_loadsh128_mask(
849 (
const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
853_mm512_load_ph(
void const *
__p) {
854 return *(
const __m512h *)
__p;
858_mm256_load_ph(
void const *
__p) {
859 return *(
const __m256h *)
__p;
863 return *(
const __m128h *)
__p;
867_mm512_loadu_ph(
void const *
__p) {
871 return ((
const struct __loadu_ph *)
__p)->__v;
875_mm256_loadu_ph(
void const *
__p) {
879 return ((
const struct __loadu_ph *)
__p)->__v;
886 return ((
const struct __loadu_ph *)
__p)->__v;
892 struct __mm_store_sh_struct {
895 ((
struct __mm_store_sh_struct *)__dp)->__u =
__a[0];
901 __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
906 *(__m512h *)
__P = __A;
911 *(__m256h *)
__P = __A;
916 *(__m128h *)
__P = __A;
924 ((
struct __storeu_ph *)
__P)->__v = __A;
932 ((
struct __storeu_ph *)
__P)->__v = __A;
940 ((
struct __storeu_ph *)
__P)->__v = __A;
954 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
960 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
966 return (__m128i)(__v8hi){
__a, 0, 0, 0, 0, 0, 0, 0};
975 return (__m512h)__builtin_ia32_rcpph512_mask(
976 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1);
980_mm512_mask_rcp_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
981 return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
986_mm512_maskz_rcp_ph(
__mmask32 __U, __m512h __A) {
987 return (__m512h)__builtin_ia32_rcpph512_mask(
988 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U);
992 return (__m512h)__builtin_ia32_rsqrtph512_mask(
993 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1);
997_mm512_mask_rsqrt_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
998 return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
1003_mm512_maskz_rsqrt_ph(
__mmask32 __U, __m512h __A) {
1004 return (__m512h)__builtin_ia32_rsqrtph512_mask(
1005 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U);
/* Calls __builtin_ia32_getmantph512_mask on a with the immediate
 * ((c << 2) | b), _mm512_undefined_ph() as third operand, mask
 * (__mmask32)-1 and _MM_FROUND_CUR_DIRECTION. */
#define _mm512_getmant_ph(a, b, c) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(((c) << 2) | (b)), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_getmantph512_mask on a with the immediate
 * ((c << 2) | b), w as third operand, k as the __mmask32 mask and
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_mask_getmant_ph(w, k, a, b, c) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(((c) << 2) | (b)), \
      (__v32hf)(__m512h)(w), \
      (__mmask32)(k), \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_getmantph512_mask on a with the immediate
 * ((c << 2) | b), _mm512_setzero_ph() as third operand, k as the __mmask32
 * mask and _MM_FROUND_CUR_DIRECTION. */
#define _mm512_maskz_getmant_ph(k, a, b, c) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(((c) << 2) | (b)), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(k), \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_getmantph512_mask on a with the immediate
 * ((c << 2) | b), _mm512_undefined_ph() as third operand, mask
 * (__mmask32)-1 and rnd as the trailing int argument. */
#define _mm512_getmant_round_ph(a, b, c, rnd) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(((c) << 2) | (b)), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_getmantph512_mask on a with the immediate
 * ((c << 2) | b), w as third operand, k as the __mmask32 mask and rnd as
 * the trailing int argument. */
#define _mm512_mask_getmant_round_ph(w, k, a, b, c, rnd) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(((c) << 2) | (b)), \
      (__v32hf)(__m512h)(w), \
      (__mmask32)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_getmantph512_mask on a with the immediate
 * ((c << 2) | b), _mm512_setzero_ph() as third operand, k as the __mmask32
 * mask and rnd as the trailing int argument. */
#define _mm512_maskz_getmant_round_ph(k, a, b, c, rnd) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(((c) << 2) | (b)), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(k), \
      (int)(rnd)))
1040 return (__m512h)__builtin_ia32_getexpph512_mask(
1041 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1,
1046_mm512_mask_getexp_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
1047 return (__m512h)__builtin_ia32_getexpph512_mask(
1052_mm512_maskz_getexp_ph(
__mmask32 __U, __m512h __A) {
1053 return (__m512h)__builtin_ia32_getexpph512_mask(
1054 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
/* Calls __builtin_ia32_getexpph512_mask on a (viewed as __v32hf) with
 * _mm512_undefined_ph() as second operand, mask (__mmask32)-1 and rnd as the
 * trailing int argument. */
#define _mm512_getexp_round_ph(a, rnd) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_getexpph512_mask on a (viewed as __v32hf) with w as
 * second operand, k as the __mmask32 mask and rnd as the trailing int
 * argument. */
#define _mm512_mask_getexp_round_ph(w, k, a, rnd) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(w), \
      (__mmask32)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_getexpph512_mask on a (viewed as __v32hf) with
 * _mm512_setzero_ph() as second operand, k as the __mmask32 mask and rnd as
 * the trailing int argument. */
#define _mm512_maskz_getexp_round_ph(k, a, rnd) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(k), \
      (int)(rnd)))
1074 return (__m512h)__builtin_ia32_scalefph512_mask(
1075 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1,
1080_mm512_mask_scalef_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
1081 return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1087_mm512_maskz_scalef_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
1088 return (__m512h)__builtin_ia32_scalefph512_mask(
1089 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
/* Calls __builtin_ia32_scalefph512_mask on a and b (each viewed as __v32hf)
 * with _mm512_undefined_ph() as third operand, mask (__mmask32)-1 and rnd as
 * the trailing int argument. */
#define _mm512_scalef_round_ph(a, b, rnd) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(b), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_scalefph512_mask on a and b (each viewed as __v32hf)
 * with w as third operand, k as the __mmask32 mask and rnd as the trailing
 * int argument. */
#define _mm512_mask_scalef_round_ph(w, k, a, b, rnd) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(b), \
      (__v32hf)(__m512h)(w), \
      (__mmask32)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_scalefph512_mask on a and b (each viewed as __v32hf)
 * with _mm512_setzero_ph() as third operand, k as the __mmask32 mask and rnd
 * as the trailing int argument. */
#define _mm512_maskz_scalef_round_ph(k, a, b, rnd) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(a), \
      (__v32hf)(__m512h)(b), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_rndscaleph_mask on A with immediate B, mask
 * (__mmask32)-1 and _MM_FROUND_CUR_DIRECTION.
 *
 *   A  source vector, viewed as __v32hf
 *   B  immediate control, passed as int
 *
 * The third (pass-through) operand is _mm512_undefined_ph(), exactly as in
 * _mm512_roundscale_round_ph, rather than a second copy of A. This keeps A
 * expanded only once, so an argument with side effects is evaluated a single
 * time. */
#define _mm512_roundscale_ph(A, B) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_rndscaleph_mask on a with immediate imm, w as third
 * operand, k as the __mmask32 mask and _MM_FROUND_CUR_DIRECTION.
 * Argument order of the macro: (w, k, a, imm). */
#define _mm512_mask_roundscale_ph(w, k, a, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(imm), \
      (__v32hf)(__m512h)(w), \
      (__mmask32)(k), \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_rndscaleph_mask on a with immediate imm,
 * _mm512_setzero_ph() as third operand, k as the __mmask32 mask and
 * _MM_FROUND_CUR_DIRECTION. Argument order of the macro: (k, a, imm). */
#define _mm512_maskz_roundscale_ph(k, a, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(k), \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_rndscaleph_mask on a with immediate imm, w as third
 * operand, k as the __mmask32 mask and rnd as the trailing int argument.
 * Argument order of the macro: (w, k, a, imm, rnd). */
#define _mm512_mask_roundscale_round_ph(w, k, a, imm, rnd) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(imm), \
      (__v32hf)(__m512h)(w), \
      (__mmask32)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_rndscaleph_mask on a with immediate imm,
 * _mm512_setzero_ph() as third operand, k as the __mmask32 mask and rnd as
 * the trailing int argument. Argument order of the macro: (k, a, imm, rnd). */
#define _mm512_maskz_roundscale_round_ph(k, a, imm, rnd) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_rndscaleph_mask on a with immediate imm,
 * _mm512_undefined_ph() as third operand, mask (__mmask32)-1 and rnd as the
 * trailing int argument. */
#define _mm512_roundscale_round_ph(a, imm, rnd) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_reduceph512_mask on a with immediate imm,
 * _mm512_undefined_ph() as third operand, mask (__mmask32)-1 and
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_reduce_ph(a, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_reduceph512_mask on a with immediate imm, w as third
 * operand, k as the __mmask32 mask and _MM_FROUND_CUR_DIRECTION. */
#define _mm512_mask_reduce_ph(w, k, a, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(imm), \
      (__v32hf)(__m512h)(w), \
      (__mmask32)(k), \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_reduceph512_mask on a with immediate imm,
 * _mm512_setzero_ph() as third operand, k as the __mmask32 mask and
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm512_maskz_reduce_ph(k, a, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(k), \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_reduceph512_mask on a with immediate imm, w as third
 * operand, k as the __mmask32 mask and rnd as the trailing int argument. */
#define _mm512_mask_reduce_round_ph(w, k, a, imm, rnd) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(imm), \
      (__v32hf)(__m512h)(w), \
      (__mmask32)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_reduceph512_mask on a with immediate imm,
 * _mm512_setzero_ph() as third operand, k as the __mmask32 mask and rnd as
 * the trailing int argument. */
#define _mm512_maskz_reduce_round_ph(k, a, imm, rnd) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(imm), \
      (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_reduceph512_mask on a with immediate imm,
 * _mm512_undefined_ph() as third operand, mask (__mmask32)-1 and rnd as the
 * trailing int argument. */
#define _mm512_reduce_round_ph(a, imm, rnd) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(a), \
      (int)(imm), \
      (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, \
      (int)(rnd)))
1170 return (__m128h)__builtin_ia32_rcpsh_mask(
1171 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1);
1178 return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
1185 return (__m128h)__builtin_ia32_rcpsh_mask(
1186 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U);
1191 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1192 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1);
1199 return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
1204_mm_maskz_rsqrt_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1205 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1206 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U);
/* Calls __builtin_ia32_getmantsh_round_mask on a and b with the immediate
 * ((d << 2) | c), _mm_setzero_ph() as fourth operand, mask (__mmask8)-1 and
 * rnd as the trailing int argument. */
#define _mm_getmant_round_sh(a, b, c, d, rnd) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (int)(((d) << 2) | (c)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(rnd)))
/* Calls __builtin_ia32_getmantsh_round_mask on a and b with the immediate
 * ((d << 2) | c), _mm_setzero_ph() as fourth operand, mask (__mmask8)-1 and
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm_getmant_sh(a, b, c, d) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (int)(((d) << 2) | (c)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_getmantsh_round_mask on a and b with the immediate
 * ((d << 2) | c), w as fourth operand, k as the __mmask8 mask and
 * _MM_FROUND_CUR_DIRECTION. */
#define _mm_mask_getmant_sh(w, k, a, b, c, d) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (int)(((d) << 2) | (c)), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_getmantsh_round_mask on a and b with the immediate
 * ((d << 2) | c), w as fourth operand, k as the __mmask8 mask and rnd as the
 * trailing int argument. */
#define _mm_mask_getmant_round_sh(w, k, a, b, c, d, rnd) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (int)(((d) << 2) | (c)), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_getmantsh_round_mask on a and b with the immediate
 * ((d << 2) | c), _mm_setzero_ph() as fourth operand, k as the __mmask8 mask
 * and _MM_FROUND_CUR_DIRECTION. */
#define _mm_maskz_getmant_sh(k, a, b, c, d) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (int)(((d) << 2) | (c)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_getmantsh_round_mask on a and b with the immediate
 * ((d << 2) | c), _mm_setzero_ph() as fourth operand, k as the __mmask8 mask
 * and rnd as the trailing int argument. */
#define _mm_maskz_getmant_round_sh(k, a, b, c, d, rnd) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (int)(((d) << 2) | (c)), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_getexpsh128_round_mask on a and b (each viewed as
 * __v8hf) with _mm_setzero_ph() as third operand, mask (__mmask8)-1 and rnd
 * as the trailing int argument. */
#define _mm_getexp_round_sh(a, b, rnd) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(rnd)))
1246 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1247 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1252_mm_mask_getexp_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
1253 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1254 (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (
__mmask8)__U,
/* Calls __builtin_ia32_getexpsh128_round_mask on a and b (each viewed as
 * __v8hf) with w as third operand, k as the __mmask8 mask and rnd as the
 * trailing int argument. */
#define _mm_mask_getexp_round_sh(w, k, a, b, rnd) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      (int)(rnd)))
1264_mm_maskz_getexp_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1265 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1266 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
/* Calls __builtin_ia32_getexpsh128_round_mask on a and b (each viewed as
 * __v8hf) with _mm_setzero_ph() as third operand, k as the __mmask8 mask and
 * rnd as the trailing int argument. */
#define _mm_maskz_getexp_round_sh(k, a, b, rnd) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_scalefsh_round_mask on a and b (each viewed as
 * __v8hf) with _mm_setzero_ph() as third operand, mask (__mmask8)-1 and rnd
 * as the trailing int argument. */
#define _mm_scalef_round_sh(a, b, rnd) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(rnd)))
1282 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1283 (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1288_mm_mask_scalef_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
1289 return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
/* Calls __builtin_ia32_scalefsh_round_mask on a and b (each viewed as
 * __v8hf) with w as third operand, k as the __mmask8 mask and rnd as the
 * trailing int argument. */
#define _mm_mask_scalef_round_sh(w, k, a, b, rnd) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      (int)(rnd)))
1300_mm_maskz_scalef_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1301 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1302 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
/* Calls __builtin_ia32_scalefsh_round_mask on a and b (each viewed as
 * __v8hf) with _mm_setzero_ph() as third operand, k as the __mmask8 mask and
 * rnd as the trailing int argument. */
#define _mm_maskz_scalef_round_sh(k, a, b, rnd) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      (int)(rnd)))
/* Calls __builtin_ia32_rndscalesh_round_mask on a and b (each viewed as
 * __v8hf) with _mm_setzero_ph() as third operand, mask (__mmask8)-1,
 * immediate imm and rnd as the trailing int argument. */
#define _mm_roundscale_round_sh(a, b, imm, rnd) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(imm), \
      (int)(rnd)))
/* Calls __builtin_ia32_rndscalesh_round_mask on a and b (each viewed as
 * __v8hf) with _mm_setzero_ph() as third operand, mask (__mmask8)-1,
 * immediate imm and _MM_FROUND_CUR_DIRECTION. */
#define _mm_roundscale_sh(a, b, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, \
      (int)(imm), \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_rndscalesh_round_mask on a and b (each viewed as
 * __v8hf) with w as third operand, k as the __mmask8 mask, immediate imm
 * and _MM_FROUND_CUR_DIRECTION. */
#define _mm_mask_roundscale_sh(w, k, a, b, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      (int)(imm), \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_rndscalesh_round_mask on a and b (each viewed as
 * __v8hf) with w as third operand, k as the __mmask8 mask, immediate imm
 * and rnd as the trailing int argument. */
#define _mm_mask_roundscale_round_sh(w, k, a, b, imm, rnd) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)(__m128h)(w), \
      (__mmask8)(k), \
      (int)(imm), \
      (int)(rnd)))
/* Calls __builtin_ia32_rndscalesh_round_mask on a and b (each viewed as
 * __v8hf) with _mm_setzero_ph() as third operand, k as the __mmask8 mask,
 * immediate imm and _MM_FROUND_CUR_DIRECTION. */
#define _mm_maskz_roundscale_sh(k, a, b, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      (int)(imm), \
      _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_rndscalesh_round_mask on a and b (each viewed as
 * __v8hf) with _mm_setzero_ph() as third operand, k as the __mmask8 mask,
 * immediate imm and rnd as the trailing int argument. */
#define _mm_maskz_roundscale_round_sh(k, a, b, imm, rnd) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(a), \
      (__v8hf)(__m128h)(b), \
      (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(k), \
      (int)(imm), \
      (int)(rnd)))
/* Unmasked form: calls __builtin_ia32_reducesh_mask on A and B with
 * _mm_setzero_ph() as the third operand and an all-ones mask ((__mmask8)-1).
 * C is forwarded as int; the final argument is _MM_FROUND_CUR_DIRECTION. */
1341#define _mm_reduce_sh(A, B, C) \
1342 ((__m128h)__builtin_ia32_reducesh_mask( \
1343 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1344 (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))
/* Masked form: calls __builtin_ia32_reducesh_mask on A and B with W as the
 * third vector operand and U as the __mmask8 mask. C is forwarded as int; the
 * final argument is _MM_FROUND_CUR_DIRECTION. */
1346#define _mm_mask_reduce_sh(W, U, A, B, C) \
1347 ((__m128h)__builtin_ia32_reducesh_mask( \
1348 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1349 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
/* Calls __builtin_ia32_reducesh_mask on A and B with _mm_setzero_ph() as the
 * third vector operand and U as the __mmask8 mask. C is forwarded as int; the
 * final argument is _MM_FROUND_CUR_DIRECTION. */
1351#define _mm_maskz_reduce_sh(U, A, B, C) \
1352 ((__m128h)__builtin_ia32_reducesh_mask( \
1353 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1354 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
/* Unmasked form with an explicit R: calls __builtin_ia32_reducesh_mask on A
 * and B with _mm_setzero_ph() as the third operand, an all-ones mask, and C
 * and R forwarded as int arguments. */
1356#define _mm_reduce_round_sh(A, B, C, R) \
1357 ((__m128h)__builtin_ia32_reducesh_mask( \
1358 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1359 (__mmask8)-1, (int)(C), (int)(R)))
/* Masked form with an explicit R: calls __builtin_ia32_reducesh_mask on A and
 * B with W as the third vector operand, U as the __mmask8 mask, and C and R
 * forwarded as int arguments. */
1361#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
1362 ((__m128h)__builtin_ia32_reducesh_mask( \
1363 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1364 (__mmask8)(U), (int)(C), (int)(R)))
/* Calls __builtin_ia32_reducesh_mask on A and B with _mm_setzero_ph() as the
 * third vector operand, U as the __mmask8 mask, and C and R forwarded as int
 * arguments; result is cast to __m128h. */
1366#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
1367 ((__m128h)__builtin_ia32_reducesh_mask( \
1368 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1369 (__mmask8)(U), (int)(C), (int)(R)))
/* Expands to __builtin_ia32_sqrtph512 on A (cast through __m512h to __v32hf)
 * with R forwarded as the int second argument; result is cast to __m512h. */
1371#define _mm512_sqrt_round_ph(A, R) \
1372 ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))
/* Builds the masked form by composition: evaluates _mm512_sqrt_round_ph(A, R)
 * and passes it, together with W, to __builtin_ia32_selectph_512 under the
 * __mmask32 U. NOTE(review): presumably lanes whose mask bit is clear take the
 * value from W -- confirm against the select builtin's definition. */
1374#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
1375 ((__m512h)__builtin_ia32_selectph_512( \
1376 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1377 (__v32hf)(__m512h)(W)))
/* Builds the zero-operand masked form by composition: evaluates
 * _mm512_sqrt_round_ph(A, R) and passes it, together with _mm512_setzero_ph(),
 * to __builtin_ia32_selectph_512 under the __mmask32 U. */
1379#define _mm512_maskz_sqrt_round_ph(U, A, R) \
1380 ((__m512h)__builtin_ia32_selectph_512( \
1381 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1382 (__v32hf)_mm512_setzero_ph()))
1385 return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
1390_mm512_mask_sqrt_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
1391 return (__m512h)__builtin_ia32_selectph_512(
1394 (__v32hf)(__m512h)(__W));
1398_mm512_maskz_sqrt_ph(
__mmask32 __U, __m512h __A) {
1399 return (__m512h)__builtin_ia32_selectph_512(
1402 (__v32hf)_mm512_setzero_ph());
/* Unmasked form: calls __builtin_ia32_sqrtsh_round_mask on A and B with
 * _mm_setzero_ph() as the third operand, an all-ones mask ((__mmask8)-1) and R
 * as the int rounding argument; result is cast to __m128h. */
1405#define _mm_sqrt_round_sh(A, B, R) \
1406 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1407 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1408 (__mmask8)-1, (int)(R)))
/* Masked form: calls __builtin_ia32_sqrtsh_round_mask on A and B with W as the
 * third vector operand, U as the __mmask8 mask and R as the int rounding
 * argument; result is cast to __m128h. */
1410#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
1411 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1412 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1413 (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_sqrtsh_round_mask on A and B with _mm_setzero_ph() as
 * the third vector operand, U as the __mmask8 mask and R as the int rounding
 * argument; result is cast to __m128h. */
1415#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
1416 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1417 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1418 (__mmask8)(U), (int)(R)))
1422 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1423 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1431 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1432 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1439 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1440 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
/* Expands to __builtin_ia32_fpclassph512_mask on A (cast to __v32hf) with imm
 * forwarded as int and U as the incoming __mmask32; the builtin's result is
 * cast to __mmask32. Note the argument order differs from the macro's:
 * the mask U is passed last. */
1444#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
1445 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
1446 (int)(imm), (__mmask32)(U)))
/* Unmasked form: calls __builtin_ia32_fpclassph512_mask on A (cast to __v32hf)
 * with imm forwarded as int and an all-ones incoming mask ((__mmask32)-1); the
 * result is cast to __mmask32. */
1448#define _mm512_fpclass_ph_mask(A, imm) \
1449 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
1450 (int)(imm), (__mmask32)-1))
1452#define _mm_fpclass_sh_mask(A, imm) \
1453 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
1456#define _mm_mask_fpclass_sh_mask(U, A, imm) \
1457 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
/* Unmasked form: calls __builtin_ia32_vcvtpd2ph512_mask with A cast to __v8df,
 * _mm_undefined_ph() as the second operand, an all-ones __mmask8 and R as the
 * int rounding argument; result is cast to __m128h. */
1460#define _mm512_cvt_roundpd_ph(A, R) \
1461 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1462 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtpd2ph512_mask with A cast to __v8df,
 * W cast to __v8hf as the second operand, U as the __mmask8 mask and R as the
 * int rounding argument; result is cast to __m128h. */
1464#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
1465 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
1466 (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtpd2ph512_mask with A cast to __v8df,
 * _mm_setzero_ph() as the second operand, U as the __mmask8 mask and R as the
 * int rounding argument; result is cast to __m128h. */
1468#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
1469 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1470 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1473 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1474 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1479_mm512_mask_cvtpd_ph(__m128h __W,
__mmask8 __U, __m512d __A) {
1480 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1485_mm512_maskz_cvtpd_ph(
__mmask8 __U, __m512d __A) {
1486 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1487 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
/* Unmasked form: calls __builtin_ia32_vcvtph2pd512_mask with A cast to __v8hf,
 * _mm512_undefined_pd() as the second operand, an all-ones __mmask8 and R as
 * the int rounding argument; result is cast to __m512d. */
1491#define _mm512_cvt_roundph_pd(A, R) \
1492 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1493 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtph2pd512_mask with A cast to __v8hf,
 * W cast to __v8df as the second operand, U as the __mmask8 mask and R as the
 * int rounding argument; result is cast to __m512d. */
1495#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
1496 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
1497 (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtph2pd512_mask with A cast to __v8hf,
 * _mm512_setzero_pd() as the second operand, U as the __mmask8 mask and R as
 * the int rounding argument; result is cast to __m512d. */
1499#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
1500 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1501 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1504 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1510_mm512_mask_cvtph_pd(__m512d __W,
__mmask8 __U, __m128h __A) {
1511 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1516_mm512_maskz_cvtph_pd(
__mmask8 __U, __m128h __A) {
1517 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
/* Unmasked form: calls __builtin_ia32_vcvtsh2ss_round_mask with A cast to
 * __v4sf, B cast to __v8hf, _mm_undefined_ps() as the third operand, an
 * all-ones __mmask8 and R as the int rounding argument; result is __m128. */
1522#define _mm_cvt_roundsh_ss(A, B, R) \
1523 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1524 (__v4sf)_mm_undefined_ps(), \
1525 (__mmask8)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtsh2ss_round_mask with A cast to
 * __v4sf, B cast to __v8hf, W cast to __v4sf as the third operand, U as the
 * __mmask8 mask and R as the int rounding argument; result is __m128. */
1527#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1528 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1529 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtsh2ss_round_mask with A cast to __v4sf, B cast to
 * __v8hf, _mm_setzero_ps() as the third operand, U as the __mmask8 mask and R
 * as the int rounding argument; result is __m128. */
1531#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1532 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1533 (__v4sf)_mm_setzero_ps(), \
1534 (__mmask8)(U), (int)(R)))
1538 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1547 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1555 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
/* Unmasked form: calls __builtin_ia32_vcvtss2sh_round_mask with A cast to
 * __v8hf, B cast to __v4sf, _mm_undefined_ph() as the third operand, an
 * all-ones __mmask8 and R as the int rounding argument; result is __m128h. */
1560#define _mm_cvt_roundss_sh(A, B, R) \
1561 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1562 (__v8hf)_mm_undefined_ph(), \
1563 (__mmask8)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtss2sh_round_mask with A cast to
 * __v8hf, B cast to __v4sf, W cast to __v8hf as the third operand, U as the
 * __mmask8 mask and R as the int rounding argument; result is __m128h. */
1565#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1566 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1567 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtss2sh_round_mask with A cast to __v8hf, B cast to
 * __v4sf, _mm_setzero_ph() as the third operand, U as the __mmask8 mask and R
 * as the int rounding argument; result is __m128h. */
1569#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1570 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1571 (__v8hf)_mm_setzero_ph(), \
1572 (__mmask8)(U), (int)(R)))
1576 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1577 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (
__mmask8)-1,
1585 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1586 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (
__mmask8)__U,
1593 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1594 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
/* Unmasked form: calls __builtin_ia32_vcvtsd2sh_round_mask with A cast to
 * __v8hf, B cast to __v2df, _mm_undefined_ph() as the third operand, an
 * all-ones __mmask8 and R as the int rounding argument; result is __m128h. */
1598#define _mm_cvt_roundsd_sh(A, B, R) \
1599 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1600 (__v8hf)_mm_undefined_ph(), \
1601 (__mmask8)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtsd2sh_round_mask with A cast to
 * __v8hf, B cast to __v2df, W cast to __v8hf as the third operand, U as the
 * __mmask8 mask and R as the int rounding argument; result is __m128h. */
1603#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1604 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1605 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtsd2sh_round_mask with A cast to __v8hf, B cast to
 * __v2df, _mm_setzero_ph() as the third operand, U as the __mmask8 mask and R
 * as the int rounding argument; result is __m128h. */
1607#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1608 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1609 (__v8hf)_mm_setzero_ph(), \
1610 (__mmask8)(U), (int)(R)))
1614 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1615 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (
__mmask8)-1,
1623 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1624 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (
__mmask8)__U,
1629_mm_maskz_cvtsd_sh(
__mmask8 __U, __m128h __A, __m128d __B) {
1630 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1631 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
/* Unmasked form: calls __builtin_ia32_vcvtsh2sd_round_mask with A cast to
 * __v2df, B cast to __v8hf, _mm_undefined_pd() as the third operand, an
 * all-ones __mmask8 and R as the int rounding argument; result is __m128d. */
1635#define _mm_cvt_roundsh_sd(A, B, R) \
1636 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1637 (__v2df)_mm_undefined_pd(), \
1638 (__mmask8)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtsh2sd_round_mask with A cast to
 * __v2df, B cast to __v8hf, W cast to __v2df as the third operand, U as the
 * __mmask8 mask and R as the int rounding argument; result is __m128d. */
1640#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1641 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1642 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtsh2sd_round_mask with A cast to __v2df, B cast to
 * __v8hf, _mm_setzero_pd() as the third operand, U as the __mmask8 mask and R
 * as the int rounding argument; result is __m128d. */
1644#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1645 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1646 (__v2df)_mm_setzero_pd(), \
1647 (__mmask8)(U), (int)(R)))
1651 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1660 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1661 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (
__mmask8)__U,
1666_mm_maskz_cvtsh_sd(
__mmask8 __U, __m128d __A, __m128h __B) {
1667 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
/* Unmasked form: calls __builtin_ia32_vcvtph2w512_mask with A cast to __v32hf,
 * _mm512_undefined_epi32() reinterpreted as __v32hi for the second operand, an
 * all-ones __mmask32 and R as the int rounding argument; result is __m512i. */
1672#define _mm512_cvt_roundph_epi16(A, R) \
1673 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1674 (__v32hi)_mm512_undefined_epi32(), \
1675 (__mmask32)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtph2w512_mask with A cast to __v32hf,
 * W cast to __v32hi as the second operand, U as the __mmask32 mask and R as
 * the int rounding argument; result is cast to __m512i. */
1677#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1678 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1679 (__mmask32)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtph2w512_mask with A cast to __v32hf,
 * _mm512_setzero_epi32() reinterpreted as __v32hi for the second operand, U as
 * the __mmask32 mask and R as the int rounding argument; result is __m512i. */
1681#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1682 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1683 (__v32hi)_mm512_setzero_epi32(), \
1684 (__mmask32)(U), (int)(R)))
1687_mm512_cvtph_epi16(__m512h __A) {
1688 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1694_mm512_mask_cvtph_epi16(__m512i __W,
__mmask32 __U, __m512h __A) {
1695 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1700_mm512_maskz_cvtph_epi16(
__mmask32 __U, __m512h __A) {
1701 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1706#define _mm512_cvtt_roundph_epi16(A, R) \
1707 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1708 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
/* Masked form: calls __builtin_ia32_vcvttph2w512_mask with A cast to __v32hf,
 * W cast to __v32hi as the second operand, U as the __mmask32 mask and R
 * forwarded as the int final argument; result is cast to __m512i. */
1711#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1712 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1713 (__mmask32)(U), (int)(R)))
/* Calls __builtin_ia32_vcvttph2w512_mask with A cast to __v32hf,
 * _mm512_setzero_epi32() reinterpreted as __v32hi for the second operand, U as
 * the __mmask32 mask and R forwarded as int; result is cast to __m512i. */
1715#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1716 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1717 (__v32hi)_mm512_setzero_epi32(), \
1718 (__mmask32)(U), (int)(R)))
1721_mm512_cvttph_epi16(__m512h __A) {
1722 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1728_mm512_mask_cvttph_epi16(__m512i __W,
__mmask32 __U, __m512h __A) {
1729 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1734_mm512_maskz_cvttph_epi16(
__mmask32 __U, __m512h __A) {
1735 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
/* Unmasked form: calls __builtin_ia32_vcvtw2ph512_mask with A cast to __v32hi,
 * _mm512_undefined_ph() as the second operand, an all-ones __mmask32 and R as
 * the int rounding argument; result is cast to __m512h. */
1740#define _mm512_cvt_roundepi16_ph(A, R) \
1741 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1742 (__v32hf)_mm512_undefined_ph(), \
1743 (__mmask32)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtw2ph512_mask with A cast to __v32hi,
 * W cast to __v32hf as the second operand, U as the __mmask32 mask and R as
 * the int rounding argument; result is cast to __m512h. */
1745#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1746 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1747 (__mmask32)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtw2ph512_mask with A cast to __v32hi,
 * _mm512_setzero_ph() as the second operand, U as the __mmask32 mask and R as
 * the int rounding argument; result is cast to __m512h. */
1749#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1750 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1751 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1754_mm512_cvtepi16_ph(__m512i __A) {
1755 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1756 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)-1,
1761_mm512_mask_cvtepi16_ph(__m512h __W,
__mmask32 __U, __m512i __A) {
1762 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1767_mm512_maskz_cvtepi16_ph(
__mmask32 __U, __m512i __A) {
1768 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1769 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1773#define _mm512_cvt_roundph_epu16(A, R) \
1774 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1775 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
/* Masked form: calls __builtin_ia32_vcvtph2uw512_mask with A cast to __v32hf,
 * W cast to the unsigned vector type __v32hu as the second operand, U as the
 * __mmask32 mask and R as the int rounding argument; result is __m512i. */
1778#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1779 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1780 (__mmask32)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtph2uw512_mask with A cast to __v32hf,
 * _mm512_setzero_epi32() reinterpreted as __v32hu for the second operand, U as
 * the __mmask32 mask and R as the int rounding argument; result is __m512i. */
1782#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1783 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1784 (__v32hu)_mm512_setzero_epi32(), \
1785 (__mmask32)(U), (int)(R)))
1788_mm512_cvtph_epu16(__m512h __A) {
1789 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1795_mm512_mask_cvtph_epu16(__m512i __W,
__mmask32 __U, __m512h __A) {
1796 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1801_mm512_maskz_cvtph_epu16(
__mmask32 __U, __m512h __A) {
1802 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1807#define _mm512_cvtt_roundph_epu16(A, R) \
1808 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1809 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
/* Masked form: calls __builtin_ia32_vcvttph2uw512_mask with A cast to
 * __v32hf, W cast to __v32hu as the second operand, U as the __mmask32 mask
 * and R forwarded as int; result is cast to __m512i. */
1812#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1813 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1814 (__mmask32)(U), (int)(R)))
/* Calls __builtin_ia32_vcvttph2uw512_mask with A cast to __v32hf,
 * _mm512_setzero_epi32() reinterpreted as __v32hu for the second operand, U as
 * the __mmask32 mask and R forwarded as int; result is cast to __m512i. */
1816#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1817 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1818 (__v32hu)_mm512_setzero_epi32(), \
1819 (__mmask32)(U), (int)(R)))
1822_mm512_cvttph_epu16(__m512h __A) {
1823 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1829_mm512_mask_cvttph_epu16(__m512i __W,
__mmask32 __U, __m512h __A) {
1830 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1835_mm512_maskz_cvttph_epu16(
__mmask32 __U, __m512h __A) {
1836 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
/* Unmasked form: calls __builtin_ia32_vcvtuw2ph512_mask with A cast to
 * __v32hu, _mm512_undefined_ph() as the second operand, an all-ones __mmask32
 * and R as the int rounding argument; result is cast to __m512h. */
1841#define _mm512_cvt_roundepu16_ph(A, R) \
1842 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1843 (__v32hf)_mm512_undefined_ph(), \
1844 (__mmask32)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtuw2ph512_mask with A cast to __v32hu,
 * W cast to __v32hf as the second operand, U as the __mmask32 mask and R as
 * the int rounding argument; result is cast to __m512h. */
1846#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1847 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1848 (__mmask32)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtuw2ph512_mask with A cast to __v32hu,
 * _mm512_setzero_ph() as the second operand, U as the __mmask32 mask and R as
 * the int rounding argument; result is cast to __m512h. */
1850#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1851 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1852 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1855_mm512_cvtepu16_ph(__m512i __A) {
1856 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1857 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)-1,
1862_mm512_mask_cvtepu16_ph(__m512h __W,
__mmask32 __U, __m512i __A) {
1863 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1868_mm512_maskz_cvtepu16_ph(
__mmask32 __U, __m512i __A) {
1869 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1870 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1874#define _mm512_cvt_roundph_epi32(A, R) \
1875 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1876 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
/* Masked form: calls __builtin_ia32_vcvtph2dq512_mask with A cast to __v16hf
 * (a 16-element half vector), W cast to __v16si as the second operand, U as
 * the __mmask16 mask and R as the int rounding argument; result is __m512i. */
1879#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1880 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1881 (__mmask16)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtph2dq512_mask with A cast to __v16hf,
 * _mm512_setzero_epi32() cast to __v16si as the second operand, U as the
 * __mmask16 mask and R as the int rounding argument; result is __m512i. */
1883#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1884 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1885 (__v16si)_mm512_setzero_epi32(), \
1886 (__mmask16)(U), (int)(R)))
1889_mm512_cvtph_epi32(__m256h __A) {
1890 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1896_mm512_mask_cvtph_epi32(__m512i __W,
__mmask16 __U, __m256h __A) {
1897 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1902_mm512_maskz_cvtph_epi32(
__mmask16 __U, __m256h __A) {
1903 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1908#define _mm512_cvt_roundph_epu32(A, R) \
1909 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1910 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
/* Masked form: calls __builtin_ia32_vcvtph2udq512_mask with A cast to
 * __v16hf, W cast to the unsigned vector type __v16su as the second operand,
 * U as the __mmask16 mask and R as the int rounding argument. */
1913#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1914 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1915 (__mmask16)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtph2udq512_mask with A cast to __v16hf,
 * _mm512_setzero_epi32() reinterpreted as __v16su for the second operand, U as
 * the __mmask16 mask and R as the int rounding argument; result is __m512i. */
1917#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1918 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1919 (__v16su)_mm512_setzero_epi32(), \
1920 (__mmask16)(U), (int)(R)))
1923_mm512_cvtph_epu32(__m256h __A) {
1924 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1930_mm512_mask_cvtph_epu32(__m512i __W,
__mmask16 __U, __m256h __A) {
1931 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1936_mm512_maskz_cvtph_epu32(
__mmask16 __U, __m256h __A) {
1937 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
/* Unmasked form: calls __builtin_ia32_vcvtdq2ph512_mask with A cast to
 * __v16si, _mm256_undefined_ph() as the second operand, an all-ones __mmask16
 * and R as the int rounding argument. The result is the 256-bit __m256h. */
1942#define _mm512_cvt_roundepi32_ph(A, R) \
1943 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1944 (__v16hf)_mm256_undefined_ph(), \
1945 (__mmask16)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtdq2ph512_mask with A cast to __v16si,
 * W cast to __v16hf as the second operand, U as the __mmask16 mask and R as
 * the int rounding argument; result is cast to __m256h. */
1947#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1948 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1949 (__mmask16)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtdq2ph512_mask with A cast to __v16si,
 * _mm256_setzero_ph() as the second operand, U as the __mmask16 mask and R as
 * the int rounding argument; result is cast to __m256h. */
1951#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1952 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1953 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1956_mm512_cvtepi32_ph(__m512i __A) {
1957 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1958 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
1963_mm512_mask_cvtepi32_ph(__m256h __W,
__mmask16 __U, __m512i __A) {
1964 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1969_mm512_maskz_cvtepi32_ph(
__mmask16 __U, __m512i __A) {
1970 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1971 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
/* Unmasked form: calls __builtin_ia32_vcvtudq2ph512_mask with A cast to
 * __v16su, _mm256_undefined_ph() as the second operand, an all-ones __mmask16
 * and R as the int rounding argument; result is cast to __m256h. */
1975#define _mm512_cvt_roundepu32_ph(A, R) \
1976 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1977 (__v16hf)_mm256_undefined_ph(), \
1978 (__mmask16)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtudq2ph512_mask with A cast to
 * __v16su, W cast to __v16hf as the second operand, U as the __mmask16 mask
 * and R as the int rounding argument; result is cast to __m256h. */
1980#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
1981 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
1982 (__mmask16)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtudq2ph512_mask with A cast to __v16su,
 * _mm256_setzero_ph() as the second operand, U as the __mmask16 mask and R as
 * the int rounding argument; result is cast to __m256h. */
1984#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
1985 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
1986 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1989_mm512_cvtepu32_ph(__m512i __A) {
1990 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1991 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
1996_mm512_mask_cvtepu32_ph(__m256h __W,
__mmask16 __U, __m512i __A) {
1997 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2002_mm512_maskz_cvtepu32_ph(
__mmask16 __U, __m512i __A) {
2003 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2004 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
2008#define _mm512_cvtt_roundph_epi32(A, R) \
2009 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2010 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
/* Masked form: calls __builtin_ia32_vcvttph2dq512_mask with A cast to
 * __v16hf, W cast to __v16si as the second operand, U as the __mmask16 mask
 * and R forwarded as int; result is cast to __m512i. */
2013#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2014 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2015 (__mmask16)(U), (int)(R)))
/* Calls __builtin_ia32_vcvttph2dq512_mask with A cast to __v16hf,
 * _mm512_setzero_epi32() cast to __v16si as the second operand, U as the
 * __mmask16 mask and R forwarded as int; result is cast to __m512i. */
2017#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2018 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2019 (__v16si)_mm512_setzero_epi32(), \
2020 (__mmask16)(U), (int)(R)))
2023_mm512_cvttph_epi32(__m256h __A) {
2024 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2030_mm512_mask_cvttph_epi32(__m512i __W,
__mmask16 __U, __m256h __A) {
2031 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2036_mm512_maskz_cvttph_epi32(
__mmask16 __U, __m256h __A) {
2037 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2042#define _mm512_cvtt_roundph_epu32(A, R) \
2043 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2044 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
/* Masked form: calls __builtin_ia32_vcvttph2udq512_mask with A cast to
 * __v16hf, W cast to __v16su as the second operand, U as the __mmask16 mask
 * and R forwarded as int; result is cast to __m512i. */
2047#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2048 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2049 (__mmask16)(U), (int)(R)))
2051#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2052 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2053 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2057_mm512_cvttph_epu32(__m256h __A) {
2058 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2064_mm512_mask_cvttph_epu32(__m512i __W,
__mmask16 __U, __m256h __A) {
2065 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2070_mm512_maskz_cvttph_epu32(
__mmask16 __U, __m256h __A) {
2071 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
/* Unmasked form: calls __builtin_ia32_vcvtqq2ph512_mask with A cast to __v8di,
 * _mm_undefined_ph() as the second operand, an all-ones __mmask8 and R as the
 * int rounding argument. The result is the 128-bit __m128h. */
2076#define _mm512_cvt_roundepi64_ph(A, R) \
2077 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2078 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtqq2ph512_mask with A cast to __v8di,
 * W cast to __v8hf as the second operand, U as the __mmask8 mask and R as the
 * int rounding argument; result is cast to __m128h. */
2080#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2081 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2082 (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtqq2ph512_mask with A cast to __v8di,
 * _mm_setzero_ph() as the second operand, U as the __mmask8 mask and R as the
 * int rounding argument; result is cast to __m128h. */
2084#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2085 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2086 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2089_mm512_cvtepi64_ph(__m512i __A) {
2090 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2091 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
2096_mm512_mask_cvtepi64_ph(__m128h __W,
__mmask8 __U, __m512i __A) {
2097 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2102_mm512_maskz_cvtepi64_ph(
__mmask8 __U, __m512i __A) {
2103 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2104 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
/* Unmasked form: calls __builtin_ia32_vcvtph2qq512_mask with A cast to __v8hf,
 * _mm512_undefined_epi32() reinterpreted as __v8di for the second operand, an
 * all-ones __mmask8 and R as the int rounding argument; result is __m512i. */
2108#define _mm512_cvt_roundph_epi64(A, R) \
2109 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2110 (__v8di)_mm512_undefined_epi32(), \
2111 (__mmask8)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtph2qq512_mask with A cast to __v8hf,
 * W cast to __v8di as the second operand, U as the __mmask8 mask and R as the
 * int rounding argument; result is cast to __m512i. */
2113#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2114 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2115 (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtph2qq512_mask with A cast to __v8hf,
 * _mm512_setzero_epi32() reinterpreted as __v8di for the second operand, U as
 * the __mmask8 mask and R as the int rounding argument; result is __m512i. */
2117#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2118 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2119 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2122_mm512_cvtph_epi64(__m128h __A) {
2123 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2129_mm512_mask_cvtph_epi64(__m512i __W,
__mmask8 __U, __m128h __A) {
2130 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2135_mm512_maskz_cvtph_epi64(
__mmask8 __U, __m128h __A) {
2136 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
/* Unmasked form: calls __builtin_ia32_vcvtuqq2ph512_mask with A cast to
 * __v8du, _mm_undefined_ph() as the second operand, an all-ones __mmask8 and R
 * as the int rounding argument; result is cast to __m128h. */
2141#define _mm512_cvt_roundepu64_ph(A, R) \
2142 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2143 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
/* Masked form: calls __builtin_ia32_vcvtuqq2ph512_mask with A cast to __v8du,
 * W cast to __v8hf as the second operand, U as the __mmask8 mask and R as the
 * int rounding argument; result is cast to __m128h. */
2145#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2146 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2147 (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtuqq2ph512_mask with A cast to __v8du,
 * _mm_setzero_ph() as the second operand, U as the __mmask8 mask and R as the
 * int rounding argument; result is cast to __m128h. */
2149#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2150 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2151 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2154_mm512_cvtepu64_ph(__m512i __A) {
2155 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2156 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
2161_mm512_mask_cvtepu64_ph(__m128h __W,
__mmask8 __U, __m512i __A) {
2162 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2167_mm512_maskz_cvtepu64_ph(
__mmask8 __U, __m512i __A) {
2168 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2169 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
2173#define _mm512_cvt_roundph_epu64(A, R) \
2174 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2175 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
/* Masked form: calls __builtin_ia32_vcvtph2uqq512_mask with A cast to __v8hf,
 * W cast to the unsigned vector type __v8du as the second operand, U as the
 * __mmask8 mask and R as the int rounding argument; result is __m512i. */
2178#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2179 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2180 (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvtph2uqq512_mask with A cast to __v8hf,
 * _mm512_setzero_epi32() reinterpreted as __v8du for the second operand, U as
 * the __mmask8 mask and R as the int rounding argument; result is __m512i. */
2182#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2183 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2184 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2187_mm512_cvtph_epu64(__m128h __A) {
2188 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2194_mm512_mask_cvtph_epu64(__m512i __W,
__mmask8 __U, __m128h __A) {
2195 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2200_mm512_maskz_cvtph_epu64(
__mmask8 __U, __m128h __A) {
2201 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2206#define _mm512_cvtt_roundph_epi64(A, R) \
2207 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2208 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
/* Masked form: calls __builtin_ia32_vcvttph2qq512_mask with A cast to __v8hf,
 * W cast to __v8di as the second operand, U as the __mmask8 mask and R
 * forwarded as int; result is cast to __m512i. */
2211#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2212 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2213 (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvttph2qq512_mask with A cast to __v8hf,
 * _mm512_setzero_epi32() reinterpreted as __v8di for the second operand, U as
 * the __mmask8 mask and R forwarded as int; result is cast to __m512i. */
2215#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2216 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2217 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2220_mm512_cvttph_epi64(__m128h __A) {
2221 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2227_mm512_mask_cvttph_epi64(__m512i __W,
__mmask8 __U, __m128h __A) {
2228 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2233_mm512_maskz_cvttph_epi64(
__mmask8 __U, __m128h __A) {
2234 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2239#define _mm512_cvtt_roundph_epu64(A, R) \
2240 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2241 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
/* Masked form: calls __builtin_ia32_vcvttph2uqq512_mask with A cast to
 * __v8hf, W cast to __v8du as the second operand, U as the __mmask8 mask and R
 * forwarded as int; result is cast to __m512i. */
2244#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
2245 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2246 (__mmask8)(U), (int)(R)))
/* Calls __builtin_ia32_vcvttph2uqq512_mask with A cast to __v8hf,
 * _mm512_setzero_epi32() reinterpreted as __v8du for the second operand, U as
 * the __mmask8 mask and R forwarded as int; result is cast to __m512i. */
2248#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
2249 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2250 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2253_mm512_cvttph_epu64(__m128h __A) {
2254 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2260_mm512_mask_cvttph_epu64(__m512i __W,
__mmask8 __U, __m128h __A) {
2261 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2266_mm512_maskz_cvttph_epu64(
__mmask8 __U, __m128h __A) {
2267 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
/* Expands to __builtin_ia32_vcvtsh2si32 on A (cast to __v8hf) with R as the
 * int second argument; the builtin's result is cast to int. */
2272#define _mm_cvt_roundsh_i32(A, R) \
2273 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
/* Expands to __builtin_ia32_vcvtsh2usi32 on A (cast to __v8hf) with R as the
 * int second argument; the builtin's result is cast to unsigned int. */
2279#define _mm_cvt_roundsh_u32(A, R) \
2280 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2283_mm_cvtsh_u32(__m128h __A) {
2284 return (
unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
/* Expands to __builtin_ia32_vcvtsh2si64 on A (cast to __v8hf) with R as the
 * int second argument; the builtin's result is cast to long long. */
2289#define _mm_cvt_roundsh_i64(A, R) \
2290 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2293 return (
long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
/* Expands to __builtin_ia32_vcvtsh2usi64 on A (cast to __v8hf) with R as the
 * int second argument; the builtin's result is cast to unsigned long long. */
2297#define _mm_cvt_roundsh_u64(A, R) \
2298 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2301_mm_cvtsh_u64(__m128h __A) {
2302 return (
unsigned long long)__builtin_ia32_vcvtsh2usi64(
/* Expands to __builtin_ia32_vcvtusi2sh with A cast to __v8hf, B cast to
 * unsigned int and R cast to int; the builtin's result is cast to __m128h. */
2307#define _mm_cvt_roundu32_sh(A, B, R) \
2308 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2311_mm_cvtu32_sh(__m128h __A,
unsigned int __B) {
2317#define _mm_cvt_roundu64_sh(A, B, R) \
2318 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2322_mm_cvtu64_sh(__m128h __A,
unsigned long long __B) {
/* Expands to __builtin_ia32_vcvtsi2sh with A cast to __v8hf, B cast to int
 * and R cast to int; the builtin's result is cast to __m128h. */
2328#define _mm_cvt_roundi32_sh(A, B, R) \
2329 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
/* Expands to __builtin_ia32_vcvtsi642sh with A cast to __v8hf, B cast to
 * long long and R cast to int; the builtin's result is cast to __m128h. */
2338#define _mm_cvt_roundi64_sh(A, B, R) \
2339 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
/* Expands to __builtin_ia32_vcvttsh2si32 on A (cast to __v8hf) with R as the
 * int second argument; the builtin's result is cast to int. */
2348#define _mm_cvtt_roundsh_i32(A, R) \
2349 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2352 return (
int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
/* Expands to __builtin_ia32_vcvttsh2si64 on A (cast to __v8hf) with R as the
 * int second argument; the builtin's result is cast to long long. */
2357#define _mm_cvtt_roundsh_i64(A, R) \
2358 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2361 return (
long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
/* Expands to __builtin_ia32_vcvttsh2usi32 on A (cast to __v8hf) with R as the
 * int second argument; the builtin's result is cast to unsigned int. */
2366#define _mm_cvtt_roundsh_u32(A, R) \
2367 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2370_mm_cvttsh_u32(__m128h __A) {
2371 return (
unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
/* Expands to __builtin_ia32_vcvttsh2usi64 on A (cast to __v8hf) with R as the
 * int second argument; the builtin's result is cast to unsigned long long. */
2376#define _mm_cvtt_roundsh_u64(A, R) \
2377 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2380_mm_cvttsh_u64(__m128h __A) {
2381 return (
unsigned long long)__builtin_ia32_vcvttsh2usi64(
/* Three forms over __builtin_ia32_vcvtph2psx512_mask, result cast to __m512.
 * They differ only in the pass-through operand and mask:
 *   plain  - _mm512_undefined_ps() with an all-ones __mmask16
 *   mask   - caller-supplied W with mask U
 *   maskz  - _mm512_setzero_ps() with mask U
 * A is viewed as __v16hf and R is forwarded as an int in every form. */
#define _mm512_cvtx_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)_mm512_undefined_ps(), \
      (__mmask16)(-1), \
      (int)(R)))
#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)(W), \
      (__mmask16)(U), \
      (int)(R)))
#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), \
      (__v16sf)_mm512_setzero_ps(), \
      (__mmask16)(U), \
      (int)(R)))
2400 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2406_mm512_mask_cvtxph_ps(__m512 __W,
__mmask16 __U, __m256h __A) {
2407 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2412_mm512_maskz_cvtxph_ps(
__mmask16 __U, __m256h __A) {
2413 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
/* Three forms over __builtin_ia32_vcvtps2phx512_mask, result cast to __m256h.
 * They differ only in the pass-through operand and mask:
 *   plain  - _mm256_undefined_ph() with an all-ones __mmask16
 *   mask   - caller-supplied W with mask U
 *   maskz  - _mm256_setzero_ph() with mask U
 * A is viewed as __v16sf and R is forwarded as an int in every form. */
#define _mm512_cvtx_roundps_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)(-1), \
      (int)(R)))
#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)(W), \
      (__mmask16)(U), \
      (int)(R)))
#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
2432 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2433 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
2438_mm512_mask_cvtxps_ph(__m256h __W,
__mmask16 __U, __m512 __A) {
2439 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2444_mm512_maskz_cvtxps_ph(
__mmask16 __U, __m512 __A) {
2445 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2446 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
/* 512-bit fmadd with explicit R. A, B and C are viewed as __v32hf and passed
 * unmodified; the forms select the _mask, _mask3 or _maskz flavour of
 * __builtin_ia32_vfmaddph512. The unmasked form passes an all-ones
 * __mmask32; the others pass U. The result is cast to __m512h. */
#define _mm512_fmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
/* 512-bit fmsub with explicit R, built on the vfmaddph512 builtins by
 * negating the third operand (C). The unmasked form passes an all-ones
 * __mmask32; the mask/maskz forms pass U. The result is cast to __m512h. */
#define _mm512_fmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
/* 512-bit fnmadd with explicit R, built on the vfmaddph512 builtins by
 * negating one multiplicand: the unmasked form negates B, while the mask3
 * and maskz forms negate A. The result is cast to __m512h. */
#define _mm512_fnmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
/* 512-bit fnmsub with explicit R, built on the vfmaddph512 builtins by
 * negating C together with one multiplicand (B in the unmasked form, A in
 * the maskz form). The result is cast to __m512h. */
#define _mm512_fnmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2513 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2519_mm512_mask_fmadd_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2520 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2526_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2527 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2533_mm512_maskz_fmadd_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2534 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2542 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2548_mm512_mask_fmsub_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2549 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2555_mm512_maskz_fmsub_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2556 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2557 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (
__mmask32)__U,
2564 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2570_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2571 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2577_mm512_maskz_fnmadd_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2578 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2586 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2592_mm512_maskz_fnmsub_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2593 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2594 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (
__mmask32)__U,
/* 512-bit fmaddsub with explicit R. A, B and C are viewed as __v32hf and
 * passed unmodified to the _mask, _mask3 or _maskz flavour of
 * __builtin_ia32_vfmaddsubph512. The unmasked form passes an all-ones
 * __mmask32; the others pass U. The result is cast to __m512h. */
#define _mm512_fmaddsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
/* 512-bit fmsubadd with explicit R, built on the vfmaddsubph512 builtins by
 * negating the third operand (C). The unmasked form passes an all-ones
 * __mmask32; the mask/maskz forms pass U. The result is cast to __m512h. */
#define _mm512_fmsubadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, \
      (int)(R)))
#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2634_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2635 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2636 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (
__mmask32)-1,
2641_mm512_mask_fmaddsub_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2642 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2643 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (
__mmask32)__U,
2648_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2649 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2650 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (
__mmask32)__U,
2655_mm512_maskz_fmaddsub_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2656 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2657 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (
__mmask32)__U,
2662_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2663 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2664 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (
__mmask32)-1,
2669_mm512_mask_fmsubadd_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2670 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2671 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (
__mmask32)__U,
2676_mm512_maskz_fmsubadd_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2677 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2678 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (
__mmask32)__U,
/* mask3 form of 512-bit fmsub with explicit R. Unlike the mask/maskz forms
 * it calls the dedicated __builtin_ia32_vfmsubph512_mask3 builtin with C
 * passed unmodified. The result is cast to __m512h. */
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2688_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2689 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
/* mask3 form of 512-bit fmsubadd with explicit R: calls the dedicated
 * __builtin_ia32_vfmsubaddph512_mask3 builtin with all three operands passed
 * unmodified. The result is cast to __m512h. */
#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
      (__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2700_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2701 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2702 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (
__mmask32)__U,
/* mask form of 512-bit fnmadd with explicit R: calls
 * __builtin_ia32_vfmaddph512_mask with B negated and mask U.
 * The result is cast to __m512h. */
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2712_mm512_mask_fnmadd_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2713 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
/* mask and mask3 forms of 512-bit fnmsub with explicit R.
 *   mask  - __builtin_ia32_vfmaddph512_mask with both B and C negated
 *   mask3 - __builtin_ia32_vfmsubph512_mask3 with only A negated
 * The result is cast to __m512h. */
#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), \
      -(__v32hf)(__m512h)(B), \
      -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      -(__v32hf)(__m512h)(A), \
      (__v32hf)(__m512h)(B), \
      (__v32hf)(__m512h)(C), \
      (__mmask32)(U), \
      (int)(R)))
2729_mm512_mask_fnmsub_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2730 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2736_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2737 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2745 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2753 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
/* Scalar fmadd with explicit R via __builtin_ia32_vfmaddsh3_mask. Operands
 * are viewed as __v8hf and passed unmodified; the unmasked form uses an
 * all-ones __mmask8, the mask form passes U. Result is cast to __m128h. */
#define _mm_fmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2768_mm_maskz_fmadd_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2769 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
/* maskz form of scalar fmadd with explicit R: calls
 * __builtin_ia32_vfmaddsh3_maskz with A, B, C unmodified and mask U.
 * The result is cast to __m128h. */
#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2780_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h
__Y,
__mmask8 __U) {
2781 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)
__Y,
/* mask3 form of scalar fmadd with explicit R: calls
 * __builtin_ia32_vfmaddsh3_mask3 with W, X, Y unmodified and mask U.
 * The result is cast to __m128h. */
#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2794 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2803 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
/* Scalar fmsub with explicit R, built on __builtin_ia32_vfmaddsh3_mask by
 * negating the last vector operand. The unmasked form uses an all-ones
 * __mmask8, the mask form passes U. The result is cast to __m128h. */
#define _mm_fmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2819_mm_maskz_fmsub_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2820 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
/* maskz form of scalar fmsub with an explicit R operand.
 *
 * Expands to __builtin_ia32_vfmaddsh3_maskz with A and B passed unmodified
 * (viewed as __v8hf), C negated, U cast to __mmask8 and R cast to int; the
 * result is cast to __m128h.
 *
 * R is parenthesised before the (int) cast, like every other macro
 * parameter in this header, so that an expression argument such as
 * `X | Y` is cast as a whole instead of the cast binding to `X` only. */
#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))
2831_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h
__Y,
__mmask8 __U) {
2832 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)
__Y,
/* mask3 form of scalar fmsub with explicit R: calls the dedicated
 * __builtin_ia32_vfmsubsh3_mask3 builtin with W, X, Y unmodified and
 * mask U. The result is cast to __m128h. */
#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      (__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2845 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2850_mm_mask_fnmadd_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
2851 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
/* Scalar fnmadd with explicit R, built on __builtin_ia32_vfmaddsh3_mask by
 * negating the second vector operand. The unmasked form uses an all-ones
 * __mmask8, the mask form passes U. The result is cast to __m128h. */
#define _mm_fnmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(A), \
      (__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2866_mm_maskz_fnmadd_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2867 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
/* maskz form of scalar fnmadd with explicit R: calls
 * __builtin_ia32_vfmaddsh3_maskz with B negated and mask U.
 * The result is cast to __m128h. */
#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2878_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h
__Y,
__mmask8 __U) {
2879 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)
__Y,
/* mask3 form of scalar fnmadd with explicit R: calls
 * __builtin_ia32_vfmaddsh3_mask3 with X negated and mask U.
 * The result is cast to __m128h. */
#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2892 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2897_mm_mask_fnmsub_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
2898 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
/* Scalar fnmsub with explicit R, built on __builtin_ia32_vfmaddsh3_mask by
 * negating both the second and third vector operands. The unmasked form
 * uses an all-ones __mmask8, the mask form passes U. Result is __m128h. */
#define _mm_fnmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      (__mmask8)(U), \
      (int)(R)))
2913_mm_maskz_fnmsub_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2914 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
/* maskz form of scalar fnmsub with explicit R: calls
 * __builtin_ia32_vfmaddsh3_maskz with B and C negated and mask U.
 * The result is cast to __m128h. */
#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), \
      -(__v8hf)(__m128h)(B), \
      -(__v8hf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2925_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h
__Y,
__mmask8 __U) {
2926 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)
__Y,
/* mask3 form of scalar fnmsub with explicit R: calls the dedicated
 * __builtin_ia32_vfmsubsh3_mask3 builtin with X negated and mask U.
 * The result is cast to __m128h. */
#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), \
      -(__v8hf)(__m128h)(X), \
      (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), \
      (int)(R)))
2939 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2945_mm_mask_fcmadd_sch(__m128h __A,
__mmask8 __U, __m128h __B, __m128h __C) {
2946 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2951_mm_maskz_fcmadd_sch(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2952 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2958_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C,
__mmask8 __U) {
2959 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
/* Scalar fcmadd (sch) with explicit R. The __m128h operands are
 * reinterpreted as __v4sf for the builtin call and the result is cast back
 * to __m128h. Builtin used per form:
 *   plain  - vfcmaddcsh_mask with an all-ones __mmask8
 *   mask   - vfcmaddcsh_round_mask with U
 *   maskz  - vfcmaddcsh_maskz with U
 *   mask3  - vfcmaddcsh_round_mask3 with U */
#define _mm_fcmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
2986 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2992_mm_mask_fmadd_sch(__m128h __A,
__mmask8 __U, __m128h __B, __m128h __C) {
2993 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2998_mm_maskz_fmadd_sch(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2999 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
3005_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C,
__mmask8 __U) {
3006 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
/* Scalar fmadd (sch) with explicit R. The __m128h operands are
 * reinterpreted as __v4sf for the builtin call and the result is cast back
 * to __m128h. Builtin used per form:
 *   plain  - vfmaddcsh_mask with an all-ones __mmask8
 *   mask   - vfmaddcsh_round_mask with U
 *   maskz  - vfmaddcsh_maskz with U
 *   mask3  - vfmaddcsh_round_mask3 with U */
#define _mm_fmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)-1, \
      (int)(R)))
#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(C), \
      (__mmask8)(U), \
      (int)(R)))
3032 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3033 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (
__mmask8)-1,
3038_mm_mask_fcmul_sch(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
3039 return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3045_mm_maskz_fcmul_sch(
__mmask8 __U, __m128h __A, __m128h __B) {
3046 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3047 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (
__mmask8)__U,
/* Scalar fcmul (sch) with explicit R via __builtin_ia32_vfcmulcsh_mask.
 * A and B are reinterpreted as __v4sf; the third operand is
 * _mm_undefined_ph() (plain, all-ones mask), W (mask) or _mm_setzero_ph()
 * (maskz). The result is cast to __m128h. */
#define _mm_fcmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))
#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
3068 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3069 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (
__mmask8)-1,
3077 return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3083_mm_maskz_fmul_sch(
__mmask8 __U, __m128h __A, __m128h __B) {
3084 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3085 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (
__mmask8)__U,
/* Scalar fmul (sch) with explicit R via __builtin_ia32_vfmulcsh_mask.
 * A and B are reinterpreted as __v4sf; the third operand is
 * _mm_undefined_ph() (plain, all-ones mask), W (mask) or _mm_setzero_ph()
 * (maskz). The result is cast to __m128h. */
#define _mm_fmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))
#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)(W), \
      (__mmask8)(U), \
      (int)(R)))
#define _mm_maskz_fmul_round_sch(U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), \
      (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
3106 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3107 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (
__mmask16)-1,
3112_mm512_mask_fcmul_pch(__m512h __W,
__mmask16 __U, __m512h __A, __m512h __B) {
3113 return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3119_mm512_maskz_fcmul_pch(
__mmask16 __U, __m512h __A, __m512h __B) {
3120 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3121 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (
__mmask16)__U,
/* 512-bit fcmul (pch) with explicit R via __builtin_ia32_vfcmulcph512_mask.
 * A and B are reinterpreted as __v16sf; the third operand is
 * _mm512_undefined_ph() (plain, all-ones __mmask16), W (mask) or
 * _mm512_setzero_ph() (maskz). The result is cast to __m512h. */
#define _mm512_fcmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))
#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(W), \
      (__mmask16)(U), \
      (int)(R)))
#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
3142 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3143 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (
__mmask16)-1,
3148_mm512_mask_fmul_pch(__m512h __W,
__mmask16 __U, __m512h __A, __m512h __B) {
3149 return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3155_mm512_maskz_fmul_pch(
__mmask16 __U, __m512h __A, __m512h __B) {
3156 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3157 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (
__mmask16)__U,
/* 512-bit fmul (pch) with explicit R via __builtin_ia32_vfmulcph512_mask.
 * A and B are reinterpreted as __v16sf; the third operand is
 * _mm512_undefined_ph() (plain, all-ones __mmask16), W (mask) or
 * _mm512_setzero_ph() (maskz). The result is cast to __m512h. */
#define _mm512_fmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))
#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(W), \
      (__mmask16)(U), \
      (int)(R)))
#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
3179 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3180 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)-1,
3185_mm512_mask_fcmadd_pch(__m512h __A,
__mmask16 __U, __m512h __B, __m512h __C) {
3186 return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3187 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)__U,
3192_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C,
__mmask16 __U) {
3193 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3194 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)__U,
3199_mm512_maskz_fcmadd_pch(
__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3200 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3201 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)__U,
/* 512-bit fcmadd (pch) with explicit R. Operands are reinterpreted as
 * __v16sf and the result is cast to __m512h. Builtin used per form:
 *   plain  - vfcmaddcph512_mask3 with an all-ones __mmask16
 *   mask   - vfcmaddcph512_mask with U
 *   mask3  - vfcmaddcph512_mask3 with U
 *   maskz  - vfcmaddcph512_maskz with U */
#define _mm512_fcmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)-1, \
      (int)(R)))
#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
3228 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3234_mm512_mask_fmadd_pch(__m512h __A,
__mmask16 __U, __m512h __B, __m512h __C) {
3235 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3241_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C,
__mmask16 __U) {
3242 return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3243 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)__U,
3248_mm512_maskz_fmadd_pch(
__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3249 return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3250 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)__U,
/* 512-bit fmadd (pch) with explicit R. Operands are reinterpreted as
 * __v16sf and the result is cast to __m512h. Builtin used per form:
 *   plain  - vfmaddcph512_mask3 with an all-ones __mmask16
 *   mask   - vfmaddcph512_mask with U
 *   mask3  - vfmaddcph512_mask3 with U
 *   maskz  - vfmaddcph512_maskz with U */
#define _mm512_fmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)-1, \
      (int)(R)))
#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), \
      (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)(C), \
      (__mmask16)(U), \
      (int)(R)))
3275_mm512_reduce_add_ph(__m512h __W) {
3276 return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3280_mm512_reduce_mul_ph(__m512h __W) {
3281 return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3285_mm512_reduce_max_ph(__m512h __V) {
3286 return __builtin_ia32_reduce_fmax_ph512(__V);
3290_mm512_reduce_min_ph(__m512h __V) {
3291 return __builtin_ia32_reduce_fmin_ph512(__V);
3295_mm512_mask_blend_ph(
__mmask32 __U, __m512h __A, __m512h __W) {
3296 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U, (__v32hf)__W,
3301_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3302 return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3307_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3308 return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
/* Alias names: each _mm512_*mul*_pch macro forwards its arguments, in the
 * same order, to the identically-shaped _mm512_*fmul*_pch intrinsic. */
#define _mm512_mul_pch(A, B) \
  _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) \
  _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) \
  _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) \
  _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R) \
  _mm512_maskz_fmul_round_pch(U, A, B, R)
/* Alias names: each _mm512_*cmul*_pch macro forwards its arguments, in the
 * same order, to the identically-shaped _mm512_*fcmul*_pch intrinsic. */
#define _mm512_cmul_pch(A, B) \
  _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) \
  _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) \
  _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) \
  _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)
/* Alias names: each _mm_*mul*_sch macro forwards its arguments, in the same
 * order, to the identically-shaped _mm_*fmul*_sch intrinsic. */
#define _mm_mul_sch(A, B) \
  _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) \
  _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) \
  _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) \
  _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R) \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) \
  _mm_maskz_fmul_round_sch(U, A, B, R)
/* Alias names: each _mm_*cmul*_sch macro forwards its arguments, in the same
 * order, to the identically-shaped _mm_*fcmul*_sch intrinsic. */
#define _mm_cmul_sch(A, B) \
  _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) \
  _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) \
  _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) \
  _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R) \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
3347#undef __DEFAULT_FN_ATTRS128
3348#undef __DEFAULT_FN_ATTRS256
3349#undef __DEFAULT_FN_ATTRS512
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float __b
static __inline__ uint32_t volatile uint32_t * __p
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS512
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_xor_ps(__m512 __A, __m512 __B)
static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_set1_epi32(int __s)
#define _mm512_setzero_epi32
static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_pd(void)
static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_and_epi32(__m512i __a, __m512i __b)
static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_set1_ps(float __w)
static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_ps(void)
#define _MM_FROUND_CUR_DIRECTION
static __inline__ void int __a
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
struct __storeu_i16 *__P __v
__inline unsigned int unsigned int unsigned int * __P
__inline unsigned int unsigned int __Y
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.