11 "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
#ifndef __AVX512VLFP16INTRIN_H
#define __AVX512VLFP16INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
                 __min_vector_width__(128)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) {
  return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0};
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) {
  return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h};
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) {
  return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
           _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) {
  return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1};
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_set1_pch(_Float16 _Complex h) {
  return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h));
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_set1_pch(_Float16 _Complex h) {
  return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h));
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) {
  return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11,
                            __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};
}
#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8)                            \
  _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))
#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16)                                          \
  _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8),   \
                (h7), (h6), (h5), (h4), (h3), (h2), (h1))
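/* Usage sketch (illustrative, not part of the header): _mm_set_ph takes its
 * arguments from most- to least-significant element, while _mm_setr_ph takes
 * them in memory order, so the two calls below build the same vector:
 *
 *   __m128h a = _mm_set_ph(8.0f16, 7.0f16, 6.0f16, 5.0f16,
 *                          4.0f16, 3.0f16, 2.0f16, 1.0f16);
 *   __m128h b = _mm_setr_ph(1.0f16, 2.0f16, 3.0f16, 4.0f16,
 *                           5.0f16, 6.0f16, 7.0f16, 8.0f16);  // a == b
 */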
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A, __m256h __B) {
  return (__m256h)((__v16hf)__A + (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A, __m128h __B) {
  return (__m128h)((__v8hf)__A + (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_add_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_add_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
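/* Masking sketch (illustrative): each mask bit controls one _Float16 lane.
 * With __U = 0x0F the low four lanes receive __A + __B; the mask_ form keeps
 * the matching lanes of __W for the zero bits, the maskz_ form writes 0.0:
 *
 *   __m128h sum_merge = _mm_mask_add_ph(w, 0x0F, a, b);  // high lanes from w
 *   __m128h sum_zero  = _mm_maskz_add_ph(0x0F, a, b);    // high lanes zeroed
 */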
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A, __m256h __B) {
  return (__m256h)((__v16hf)__A - (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A, __m128h __B) {
  return (__m128h)((__v8hf)__A - (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_sub_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_sub_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A, __m256h __B) {
  return (__m256h)((__v16hf)__A * (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A, __m128h __B) {
  return (__m128h)((__v8hf)__A * (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_mul_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_mul_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A, __m256h __B) {
  return (__m256h)((__v16hf)__A / (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A, __m128h __B) {
  return (__m128h)((__v8hf)__A / (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_div_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_div_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_min_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_min_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_max_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_max_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) {
  return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f));
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) {
  return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f));
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_conj_pch(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps());
}
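/* Layout note (illustrative): the *_pch intrinsics treat a vector of _Float16
 * as packed complex values, real part in the even lane and imaginary part in
 * the odd lane, which is why the implementations reuse 32-bit float lanes and
 * __mmask8 masks. Conjugation just flips the sign bit of each imaginary half:
 *
 *   __m128h z = _mm_setr_ph(1.0f16, 2.0f16, 1.0f16, 2.0f16,
 *                           1.0f16, 2.0f16, 1.0f16, 2.0f16); // four 1+2i
 *   __m128h c = _mm_conj_pch(z);                             // four 1-2i
 */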
#define _mm256_cmp_ph_mask(a, b, p)                                            \
  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1))

#define _mm256_mask_cmp_ph_mask(m, a, b, p)                                    \
  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m)))

#define _mm_cmp_ph_mask(a, b, p)                                               \
  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1))

#define _mm_mask_cmp_ph_mask(m, a, b, p)                                       \
  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m)))
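/* Comparison sketch (illustrative): the predicate is one of the _CMP_*
 * constants from <immintrin.h>, and the result is a bitmask rather than a
 * vector, so it composes directly with the masked intrinsics:
 *
 *   __mmask8 lt = _mm_cmp_ph_mask(a, b, _CMP_LT_OS); // bit i set if a[i] < b[i]
 *   __m128h  s  = _mm_maskz_add_ph(lt, a, b);        // add only where a < b
 */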
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,
                                               (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_rcp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,
                                               (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rcp_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,
                                                 (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask(
      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_rsqrt_ph(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,
                                                 (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,
                                                  (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,
                                                  (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}
#define _mm_getmant_ph(A, B, C)                                                \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1))

#define _mm_mask_getmant_ph(W, U, A, B, C)                                     \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W),     \
      (__mmask8)(U)))

#define _mm_maskz_getmant_ph(U, A, B, C)                                       \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U)))

#define _mm256_getmant_ph(A, B, C)                                             \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))

#define _mm256_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W),   \
      (__mmask16)(U)))

#define _mm256_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))
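/* Decomposition sketch (illustrative): getexp/getmant split a value into
 * exponent and mantissa. With the normalization interval [1, 2) and the sign
 * taken from the source (the _MM_MANT_* enums from <immintrin.h>), a finite
 * nonzero x is recovered as mant * 2^exp:
 *
 *   __m128h e = _mm_getexp_ph(x);  // per lane: floor(log2(|x|))
 *   __m128h m = _mm_getmant_ph(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 *   // per lane: x == m * 2^e
 */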
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask(
      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B,
                                                  (__v16hf)__W, (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask(
      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}
#define _mm_roundscale_ph(A, imm)                                              \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)-1))

#define _mm_mask_roundscale_ph(W, U, A, imm)                                   \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_roundscale_ph(U, A, imm)                                     \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)(U)))

#define _mm256_roundscale_ph(A, imm)                                           \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)-1))

#define _mm256_mask_roundscale_ph(W, U, A, imm)                                \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W),                \
      (__mmask16)(U)))

#define _mm256_maskz_roundscale_ph(U, A, imm)                                  \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)(U)))
#define _mm_reduce_ph(A, imm)                                                  \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
                                            (__v8hf)_mm_setzero_ph(),          \
                                            (__mmask8)-1))

#define _mm_mask_reduce_ph(W, U, A, imm)                                       \
  ((__m128h)__builtin_ia32_reduceph128_mask(                                   \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_reduce_ph(U, A, imm)                                         \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
                                            (__v8hf)_mm_setzero_ph(),          \
                                            (__mmask8)(U)))

#define _mm256_reduce_ph(A, imm)                                               \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)_mm256_setzero_ph(),      \
                                            (__mmask16)-1))

#define _mm256_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)(__m256h)(W),             \
                                            (__mmask16)(U)))

#define _mm256_maskz_reduce_ph(U, A, imm)                                      \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)_mm256_setzero_ph(),      \
                                            (__mmask16)(U)))
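/* Rounding sketch (illustrative, assuming the EVEX rndscale/reduce imm8
 * layout): imm[7:4] selects how many binary fraction bits to keep and
 * imm[1:0] the rounding mode, so 0 rounds each lane to the nearest integer.
 * Note the _mm_reduce_ph macros above wrap VREDUCEPH (x minus its rounded
 * value) and are unrelated to the horizontal _mm_reduce_add_ph reductions
 * defined further down:
 *
 *   __m128h r    = _mm_roundscale_ph(x, 0); // round each lane to an integer
 *   __m128h frac = _mm_reduce_ph(x, 0);     // per lane: x - round(x)
 */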
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
  return __builtin_ia32_sqrtph((__v8hf)__a);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_sqrt_ph(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
  return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_sqrt_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
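/* Precision note (illustrative): _mm_rcp_ph and _mm_rsqrt_ph earlier in this
 * file map to hardware approximation instructions, close to but not
 * guaranteed correctly rounded, whereas _mm_sqrt_ph is IEEE-correct; prefer
 * the approximations only when the extra ULPs are acceptable:
 *
 *   __m128h approx = _mm_rcp_ph(x);   // fast ~1.0/x per lane
 *   __m128h exact  = _mm_sqrt_ph(x);  // correctly rounded sqrt per lane
 */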
#define _mm_mask_fpclass_ph_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
                                              (int)(imm), (__mmask8)(U)))

#define _mm_fpclass_ph_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
                                              (int)(imm), (__mmask8)-1))

#define _mm256_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm), (__mmask16)(U)))

#define _mm256_fpclass_ph_mask(A, imm)                                         \
  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm), (__mmask16)-1))
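/* Classification sketch (illustrative): the fpclass immediate is a bitmask of
 * categories, and a mask bit is set for every lane matching any selected one.
 * Assuming the usual EVEX fpclass encoding (0x01 = QNaN, 0x80 = SNaN):
 *
 *   __mmask8 nan_lanes = _mm_fpclass_ph_mask(x, 0x81);  // any NaN lane
 */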
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
      (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
      (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
      (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
      (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
      (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_pd(__m128d __W, __mmask8 __U, __m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
      (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
      (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
      (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W,
                                                  (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W,
                                                  (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W,
                                                   (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) {
  return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_cvtepi16_ph(__m256i __A) {
  return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_cvtepi16_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,
                                                   (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,
                                                    (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) {
  return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_cvtepu16_ph(__m256i __A) {
  return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_cvtepu16_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
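/* Conversion note (illustrative): the cvtph-to-integer intrinsics round
 * according to the current MXCSR rounding mode, while the cvttph variants
 * always truncate toward zero, matching C cast semantics:
 *
 *   __m128i nearest = _mm_cvtph_epi16(x);  // default: round to nearest even
 *   __m128i trunc   = _mm_cvttph_epi16(x); // truncates, like (short)x[i]
 */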
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
      (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
      (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepi32_ph(__m256i __A) {
  return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph());
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
      (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
      (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepu32_ph(__m256i __A) {
  return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
      (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
      (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepi64_ph(__m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
      (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
      (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
      (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
      (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepu64_ph(__m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
      (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
      (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
      (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtxph_ps(__m128 __W, __mmask8 __U, __m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
      (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
      (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
      (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
      (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
      (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
      (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
      (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
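/* Naming note (illustrative): the "x" in cvtxph_ps/cvtxps_ph keeps these
 * AVX512-FP16 intrinsics distinct from the older F16C _mm_cvtps_ph and
 * _mm_cvtph_ps, which traffic in __m128i bit patterns rather than the typed
 * __m128h vector:
 *
 *   __m128  f32 = _mm_cvtxph_ps(h);  // low 4 halves -> 4 floats
 *   __m128h f16 = _mm_cvtxps_ph(f);  // 4 floats -> halves in the low lanes
 */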
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
                                          (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
                                          -(__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
                                             (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
                                             -(__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
                                             (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
                                             -(__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
                                                (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
                                                -(__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
                                          (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
                                             (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
                                          -(__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
                                             -(__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}
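/* FMA sign conventions (illustrative), each computed with a single rounding:
 *
 *   fmadd:  A*B + C      fmsub:  A*B - C
 *   fnmadd: -(A*B) + C   fnmsub: -(A*B) - C
 *
 * The mask_ variants keep unselected lanes from A, mask3_ keeps them from C,
 * and maskz_ zeroes them, which is why all three shapes exist per operation.
 */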
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmul_pch(__m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfcmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__W, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfcmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B,
                                        (__v4sf)__C, (__mmask8)__U),
      (__v4sf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectps_256(
      __U,
      __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
                                        (__mmask8)__U),
      (__v8sf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                  (__v4sf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                  (__v8sf)__W, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__C, (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
                                       (__mmask8)__U),
      (__v4sf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__C, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__C, (__mmask8)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectps_256(
      __U,
      __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
                                       (__mmask8)__U),
      (__v8sf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__C, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)__U);
}
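/* Complex multiply sketch (illustrative): with (real, imag) _Float16 pairs
 * packed in adjacent lanes, fmul computes a*b and fcmul computes a*conj(b)
 * per complex element; the fmadd/fcmadd forms additionally accumulate C:
 *
 *   __m128h prod = _mm_fmul_pch(a, b);  // (ar*br - ai*bi, ar*bi + ai*br)
 *   __m128h corr = _mm_fcmul_pch(a, b); // (ar*br + ai*bi, ai*br - ar*bi)
 */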
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) {
  return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W,
                                              (__v8hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W,
                                              (__v16hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
  return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                 (__v8hi)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
  return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                 (__v16hi)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_permutexvar_ph(__m128i __A, __m128h __B) {
  return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_permutexvar_ph(__m256i __A, __m256h __B) {
  return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
}
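/* Shuffle sketch (illustrative): _mm_permutexvar_ph picks each result lane
 * from __B using the matching 16-bit index in __A, so an index vector of
 * 7..0 reverses the eight halves:
 *
 *   __m128i idx = _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0);
 *   __m128h rev = _mm_permutexvar_ph(idx, v);
 */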
static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_add_ph(__m256h __W) {
  return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_ph(__m256h __W) {
  return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_max_ph(__m256h __V) {
  return __builtin_ia32_reduce_fmax_ph256(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_min_ph(__m256h __V) {
  return __builtin_ia32_reduce_fmin_ph256(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_add_ph(__m128h __W) {
  return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_mul_ph(__m128h __W) {
  return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_max_ph(__m128h __V) {
  return __builtin_ia32_reduce_fmax_ph128(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_min_ph(__m128h __V) {
  return __builtin_ia32_reduce_fmin_ph128(__V);
}
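/* Reduction note (illustrative): -0.0 is the identity for floating-point
 * addition (it does not perturb the sum, even when the result is +0.0) and
 * 1.0 is the identity for multiplication, which is why those start values
 * are passed to the reduction builtins above:
 *
 *   _Float16 total = _mm256_reduce_add_ph(v);  // v[0] + v[1] + ... + v[15]
 */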
#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B)
#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B)
#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B)
#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B)
#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B)
#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B)

#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B)
#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B)
#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B)
#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B)
#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B)
#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B)
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif /* __AVX512VLFP16INTRIN_H */