10#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
15#ifndef __AVX10_2NIINTRIN_H
16#define __AVX10_2NIINTRIN_H
18#define __DEFAULT_FN_ATTRS128 \
19 __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
20 __min_vector_width__(128)))
21#define __DEFAULT_FN_ATTRS256 \
22 __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
23 __min_vector_width__(256)))
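/* FP16 dot-product (VDPPHPS) intrinsics: each 32-bit accumulator lane of __W
   is updated with the sum of products of the two adjacent FP16 elements of
   __A and __B that overlay it. The mask/maskz wrappers below follow the usual
   AVX-512 write-mask conventions (merge into __W, or zero disabled lanes). */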
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_dpph_ps(__m128 __W, __m128h __A, __m128h __B) {
  return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A,
                                           (__v8hf)__B);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_dpph_ps(__m128 __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_dpph_ps(__mmask8 __U, __m128 __W, __m128h __A, __m128h __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_dpph_ps(__W, __A, __B),
                                             (__v4sf)_mm_setzero_ps());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_dpph_ps(__m256 __W, __m256h __A, __m256h __B) {
  return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A,
                                           (__v16hf)__B);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B),
      (__v8sf)_mm256_setzero_ps());
}
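/* Masked wrappers around _mm_mpsadbw_epu8/_mm256_mpsadbw_epu8: the immediate
   selects the byte groups, the unsigned sums of absolute differences are
   produced as 16-bit lanes, and the write mask merges into W or zeroes. */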
#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm)                                 \
  ((__m128i)__builtin_ia32_selectw_128(                                        \
      (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)),                \
      (__v8hi)(__m128i)(W)))

#define _mm_maskz_mpsadbw_epu8(U, A, B, imm)                                   \
  ((__m128i)__builtin_ia32_selectw_128(                                        \
      (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)),                \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm)                              \
  ((__m256i)__builtin_ia32_selectw_256(                                        \
      (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)),           \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm)                                \
  ((__m256i)__builtin_ia32_selectw_256(                                        \
      (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)),           \
      (__v16hi)_mm256_setzero_si256()))
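/* AVX-VNNI-INT8 dot products with write masking. The letter pairs encode
   operand signedness (ss = signed*signed, su = signed*unsigned,
   uu = unsigned*unsigned); a trailing "s" selects saturating accumulation.
   Four byte products are summed into each 32-bit accumulator lane. */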
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A,
                          __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A,
                          __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbssds_epi32(__mmask8 __U, __m256i __W, __m256i __A,
                           __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A,
                          __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A,
                          __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbsuds_epi32(__mmask8 __U, __m256i __W, __m256i __A,
                           __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A,
                          __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A,
                          __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbuuds_epi32(__mmask8 __U, __m256i __W, __m256i __A,
                           __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}
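/* AVX-VNNI-INT16 dot products with write masking: two 16-bit products are
   accumulated into each 32-bit lane of __A. The signedness suffixes and the
   saturating "s" forms follow the same naming scheme as the byte variants. */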
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                          __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                          __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                           __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                          __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                          __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                           __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                          __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                          __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B,
                           __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}
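/* From here on the header defines 256-bit operations that take an explicit
   rounding/SAE control R. An illustrative invocation (not from this file):
     __m256d r = _mm256_add_round_pd(a, b,
                                     _MM_FROUND_TO_NEAREST_INT |
                                         _MM_FROUND_NO_EXC);
   R must be a compile-time constant. */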
#define _mm256_add_round_pd(A, B, R)                                           \
  ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A),               \
                                           (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_add_round_pd(W, U, A, B, R)                                \
  ((__m256d)__builtin_ia32_selectpd_256(                                       \
      (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)),               \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_add_round_pd(U, A, B, R)                                  \
  ((__m256d)__builtin_ia32_selectpd_256(                                       \
      (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)),               \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_add_round_ph(A, B, R)                                           \
  ((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A),              \
                                           (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m256h)__builtin_ia32_selectph_256(                                       \
      (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)),             \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m256h)__builtin_ia32_selectph_256(                                       \
      (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)),             \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_add_round_ps(A, B, R)                                           \
  ((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A),                 \
                                          (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_add_round_ps(W, U, A, B, R)                                \
  ((__m256)__builtin_ia32_selectps_256(                                        \
      (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)),               \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_add_round_ps(U, A, B, R)                                  \
  ((__m256)__builtin_ia32_selectps_256(                                        \
      (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)),               \
      (__v8sf)_mm256_setzero_ps()))
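/* Compare-with-rounding macros: P is one of the _CMP_* predicates and R is an
   SAE control (e.g. _MM_FROUND_NO_EXC); the result is a bit mask with one bit
   per compared lane. */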
#define _mm256_cmp_round_pd_mask(A, B, P, R)                                   \
  ((__mmask8)__builtin_ia32_vcmppd256_round_mask(                              \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)-1,      \
      (int)(R)))

#define _mm256_mask_cmp_round_pd_mask(U, A, B, P, R)                           \
  ((__mmask8)__builtin_ia32_vcmppd256_round_mask(                              \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)(U),     \
      (int)(R)))

#define _mm256_cmp_round_ph_mask(A, B, P, R)                                   \
  ((__mmask16)__builtin_ia32_vcmpph256_round_mask(                             \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)-1,   \
      (int)(R)))

#define _mm256_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask16)__builtin_ia32_vcmpph256_round_mask(                             \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)(U),  \
      (int)(R)))

#define _mm256_cmp_round_ps_mask(A, B, P, R)                                   \
  ((__mmask8)__builtin_ia32_vcmpps256_round_mask(                              \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)-1,        \
      (int)(R)))

#define _mm256_mask_cmp_round_ps_mask(U, A, B, P, R)                           \
  ((__mmask8)__builtin_ia32_vcmpps256_round_mask(                              \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)(U),       \
      (int)(R)))
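/* Conversions from 32-bit integers to FP16/FP32 with rounding control. */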
#define _mm256_cvt_roundepi32_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask(                            \
      (__v8si)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepi32_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask((__v8si)(A), (__v8hf)(W),   \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi32_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask(                            \
      (__v8si)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundepi32_ps(A, R)                                         \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A),        \
                                                  (__v8sf)_mm256_setzero_ps(), \
                                                  (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundepi32_ps(W, U, A, R)                              \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask(                             \
      (__v8si)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi32_ps(U, A, R)                                \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A),        \
                                                  (__v8sf)_mm256_setzero_ps(), \
                                                  (__mmask8)(U), (int)(R)))
#define _mm256_cvt_roundpd_epi32(A, R)                                         \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1,         \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epi32(W, U, A, R)                              \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epi32(U, A, R)                                \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U),        \
      (int)(R)))

#define _mm256_cvt_roundpd_ph(A, R)                                            \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask(                            \
      (__v4df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundpd_ph(W, U, A, R)                                 \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask((__v4df)(A), (__v8hf)(W),   \
                                                   (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_ph(U, A, R)                                   \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask(                            \
      (__v4df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundpd_ps(A, R)                                            \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask(                             \
      (__v4df)(__m256d)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundpd_ps(W, U, A, R)                                 \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask(                             \
      (__v4df)(__m256d)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_ps(U, A, R)                                   \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask((__v4df)(__m256d)(A),        \
                                                  (__v4sf)_mm_setzero_ps(),    \
                                                  (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundpd_epi64(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1,      \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epi64(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epi64(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask(                            \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U),     \
      (int)(R)))

#define _mm256_cvt_roundpd_epu32(A, R)                                         \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1,         \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epu32(W, U, A, R)                              \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epu32(U, A, R)                                \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U),        \
      (int)(R)))

#define _mm256_cvt_roundpd_epu64(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1,      \
      (int)(R)))

#define _mm256_mask_cvt_roundpd_epu64(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epu64(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U),     \
      (int)(R)))
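/* Conversions from FP16 to integer and wider floating-point types with
   rounding (or SAE) control. */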
#define _mm256_cvt_roundph_epi32(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask(                            \
      (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1),           \
      (int)(R)))
590#define _mm256_mask_cvt_roundph_epi32(W, U, A, R) \
591 ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask((__v8hf)(A), (__v8si)(W), \
592 (__mmask8)(U), (int)(R)))
594#define _mm256_maskz_cvt_roundph_epi32(U, A, R) \
595 ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \
596 (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
598#define _mm256_cvt_roundph_pd(A, R) \
599 ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \
600 (__v8hf)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)(-1), (int)(R)))
602#define _mm256_mask_cvt_roundph_pd(W, U, A, R) \
603 ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask((__v8hf)(A), (__v4df)(W), \
604 (__mmask8)(U), (int)(R)))
606#define _mm256_maskz_cvt_roundph_pd(U, A, R) \
607 ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \
608 (__v8hf)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
610#define _mm256_cvtx_roundph_ps(A, R) \
611 ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \
612 (__v8hf)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)(-1), (int)(R)))
614#define _mm256_mask_cvtx_roundph_ps(W, U, A, R) \
615 ((__m256)__builtin_ia32_vcvtph2psx256_round_mask((__v8hf)(A), (__v8sf)(W), \
616 (__mmask8)(U), (int)(R)))
618#define _mm256_maskz_cvtx_roundph_ps(U, A, R) \
619 ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \
620 (__v8hf)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
622#define _mm256_cvt_roundph_epi64(A, R) \
623 ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \
624 (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1), \
627#define _mm256_mask_cvt_roundph_epi64(W, U, A, R) \
628 ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask((__v8hf)(A), (__v4di)(W), \
629 (__mmask8)(U), (int)(R)))
631#define _mm256_maskz_cvt_roundph_epi64(U, A, R) \
632 ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \
633 (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
635#define _mm256_cvt_roundph_epu32(A, R) \
636 ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \
637 (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1), \
640#define _mm256_mask_cvt_roundph_epu32(W, U, A, R) \
641 ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \
642 (__mmask8)(U), (int)(R)))
644#define _mm256_maskz_cvt_roundph_epu32(U, A, R) \
645 ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \
646 (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
648#define _mm256_cvt_roundph_epu64(A, R) \
649 ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \
650 (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1), \
653#define _mm256_mask_cvt_roundph_epu64(W, U, A, R) \
654 ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \
655 (__mmask8)(U), (int)(R)))
657#define _mm256_maskz_cvt_roundph_epu64(U, A, R) \
658 ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \
659 (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
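/* FP16 to 16-bit integer conversions with rounding control (full 16-element
   vectors, __mmask16 write masks). */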
#define _mm256_cvt_roundph_epu16(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask(                            \
      (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1),        \
      (int)(R)))
666#define _mm256_mask_cvt_roundph_epu16(W, U, A, R) \
667 ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask((__v16hf)(A), (__v16hu)(W), \
668 (__mmask16)(U), (int)(R)))
670#define _mm256_maskz_cvt_roundph_epu16(U, A, R) \
671 ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \
672 (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
675#define _mm256_cvt_roundph_epi16(A, R) \
676 ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \
677 (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1), \
680#define _mm256_mask_cvt_roundph_epi16(W, U, A, R) \
681 ((__m256i)__builtin_ia32_vcvtph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \
682 (__mmask16)(U), (int)(R)))
684#define _mm256_maskz_cvt_roundph_epi16(U, A, R) \
685 ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \
686 (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U), \
689#define _mm256_cvt_roundps_epi32(A, R) \
690 ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
691 (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
694#define _mm256_mask_cvt_roundps_epi32(W, U, A, R) \
695 ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
696 (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))
698#define _mm256_maskz_cvt_roundps_epi32(U, A, R) \
699 ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
700 (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
703#define _mm256_cvt_roundps_pd(A, R) \
704 ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
705 (__v4sf)(__m128)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1, \
708#define _mm256_mask_cvt_roundps_pd(W, U, A, R) \
709 ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
710 (__v4sf)(__m128)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
712#define _mm256_maskz_cvt_roundps_pd(U, A, R) \
713 ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
714 (__v4sf)(__m128)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
717#define _mm256_cvt_roundps_ph(A, I) \
718 ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
719 (__v8hi)_mm_undefined_si128(), \
735#define _mm256_cvtx_roundps_ph(A, R) \
736 ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask( \
737 (__v8sf)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
739#define _mm256_mask_cvtx_roundps_ph(W, U, A, R) \
740 ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask((__v8sf)(A), (__v8hf)(W), \
741 (__mmask8)(U), (int)(R)))
743#define _mm256_maskz_cvtx_roundps_ph(U, A, R) \
744 ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask( \
745 (__v8sf)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
747#define _mm256_cvt_roundps_epi64(A, R) \
748 ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
749 (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
752#define _mm256_mask_cvt_roundps_epi64(W, U, A, R) \
753 ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
754 (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))
756#define _mm256_maskz_cvt_roundps_epi64(U, A, R) \
757 ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
758 (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
761#define _mm256_cvt_roundps_epu32(A, R) \
762 ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
763 (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
766#define _mm256_mask_cvt_roundps_epu32(W, U, A, R) \
767 ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
768 (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))
770#define _mm256_maskz_cvt_roundps_epu32(U, A, R) \
771 ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
772 (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \
775#define _mm256_cvt_roundps_epu64(A, R) \
776 ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
777 (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
780#define _mm256_mask_cvt_roundps_epu64(W, U, A, R) \
781 ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
782 (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))
784#define _mm256_maskz_cvt_roundps_epu64(U, A, R) \
785 ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
786 (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
789#define _mm256_cvt_roundepi64_pd(A, R) \
790 ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
791 (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \
794#define _mm256_mask_cvt_roundepi64_pd(W, U, A, R) \
795 ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
796 (__v4di)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
798#define _mm256_maskz_cvt_roundepi64_pd(U, A, R) \
799 ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
800 (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
803#define _mm256_cvt_roundepi64_ph(A, R) \
804 ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \
805 (__v4di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
807#define _mm256_mask_cvt_roundepi64_ph(W, U, A, R) \
808 ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask((__v4di)(A), (__v8hf)(W), \
809 (__mmask8)(U), (int)(R)))
811#define _mm256_maskz_cvt_roundepi64_ph(U, A, R) \
812 ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \
813 (__v4di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
815#define _mm256_cvt_roundepi64_ps(A, R) \
816 ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \
817 (__v4di)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
819#define _mm256_mask_cvt_roundepi64_ps(W, U, A, R) \
820 ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \
821 (__v4di)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
823#define _mm256_maskz_cvt_roundepi64_ps(U, A, R) \
824 ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask((__v4di)(__m256i)(A), \
825 (__v4sf)_mm_setzero_ps(), \
826 (__mmask8)(U), (int)(R)))
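/* The "cvtt" conversions below truncate toward zero; R here acts as an SAE
   control rather than a rounding-mode selector. */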
#define _mm256_cvtt_roundpd_epi32(A, R)                                        \
  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask(                           \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1,         \
      (int)(R)))
833#define _mm256_mask_cvtt_roundpd_epi32(W, U, A, R) \
834 ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
835 (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R)))
837#define _mm256_maskz_cvtt_roundpd_epi32(U, A, R) \
838 ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
839 (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U), \
842#define _mm256_cvtt_roundpd_epi64(A, R) \
843 ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
844 (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
847#define _mm256_mask_cvtt_roundpd_epi64(W, U, A, R) \
848 ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
849 (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))
851#define _mm256_maskz_cvtt_roundpd_epi64(U, A, R) \
852 ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
853 (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
856#define _mm256_cvtt_roundpd_epu32(A, R) \
857 ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
858 (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1, \
861#define _mm256_mask_cvtt_roundpd_epu32(W, U, A, R) \
862 ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
863 (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R)))
865#define _mm256_maskz_cvtt_roundpd_epu32(U, A, R) \
866 ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
867 (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U), \
870#define _mm256_cvtt_roundpd_epu64(A, R) \
871 ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
872 (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
875#define _mm256_mask_cvtt_roundpd_epu64(W, U, A, R) \
876 ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
877 (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))
879#define _mm256_maskz_cvtt_roundpd_epu64(U, A, R) \
880 ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
881 (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
884#define _mm256_cvtt_roundph_epi32(A, R) \
885 ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \
886 (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1), \
889#define _mm256_mask_cvtt_roundph_epi32(W, U, A, R) \
890 ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask((__v8hf)(A), (__v8si)(W), \
891 (__mmask8)(U), (int)(R)))
893#define _mm256_maskz_cvtt_roundph_epi32(U, A, R) \
894 ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \
895 (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
897#define _mm256_cvtt_roundph_epi64(A, R) \
898 ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \
899 (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1), \
902#define _mm256_mask_cvtt_roundph_epi64(W, U, A, R) \
903 ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask((__v8hf)(A), (__v4di)(W), \
904 (__mmask8)(U), (int)(R)))
906#define _mm256_maskz_cvtt_roundph_epi64(U, A, R) \
907 ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \
908 (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
910#define _mm256_cvtt_roundph_epu32(A, R) \
911 ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \
912 (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1), \
915#define _mm256_mask_cvtt_roundph_epu32(W, U, A, R) \
916 ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \
917 (__mmask8)(U), (int)(R)))
919#define _mm256_maskz_cvtt_roundph_epu32(U, A, R) \
920 ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \
921 (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
923#define _mm256_cvtt_roundph_epu64(A, R) \
924 ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \
925 (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1), \
928#define _mm256_mask_cvtt_roundph_epu64(W, U, A, R) \
929 ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \
930 (__mmask8)(U), (int)(R)))
932#define _mm256_maskz_cvtt_roundph_epu64(U, A, R) \
933 ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \
934 (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
936#define _mm256_cvtt_roundph_epu16(A, R) \
937 ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
938 (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1), \
941#define _mm256_mask_cvtt_roundph_epu16(W, U, A, R) \
942 ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
943 (__v16hf)(A), (__v16hu)(W), (__mmask16)(U), (int)(R)))
945#define _mm256_maskz_cvtt_roundph_epu16(U, A, R) \
946 ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
947 (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
950#define _mm256_cvtt_roundph_epi16(A, R) \
951 ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \
952 (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1), \
955#define _mm256_mask_cvtt_roundph_epi16(W, U, A, R) \
956 ((__m256i)__builtin_ia32_vcvttph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \
957 (__mmask16)(U), (int)(R)))
959#define _mm256_maskz_cvtt_roundph_epi16(U, A, R) \
960 ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \
961 (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U), \
964#define _mm256_cvtt_roundps_epi32(A, R) \
965 ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
966 (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
969#define _mm256_mask_cvtt_roundps_epi32(W, U, A, R) \
970 ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
971 (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))
973#define _mm256_maskz_cvtt_roundps_epi32(U, A, R) \
974 ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
975 (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
978#define _mm256_cvtt_roundps_epi64(A, R) \
979 ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
980 (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
983#define _mm256_mask_cvtt_roundps_epi64(W, U, A, R) \
984 ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
985 (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))
987#define _mm256_maskz_cvtt_roundps_epi64(U, A, R) \
988 ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
989 (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
992#define _mm256_cvtt_roundps_epu32(A, R) \
993 ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
994 (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
997#define _mm256_mask_cvtt_roundps_epu32(W, U, A, R) \
998 ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
999 (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))
1001#define _mm256_maskz_cvtt_roundps_epu32(U, A, R) \
1002 ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
1003 (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \
1006#define _mm256_cvtt_roundps_epu64(A, R) \
1007 ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
1008 (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
1011#define _mm256_mask_cvtt_roundps_epu64(W, U, A, R) \
1012 ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
1013 (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))
1015#define _mm256_maskz_cvtt_roundps_epu64(U, A, R) \
1016 ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
1017 (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
1020#define _mm256_cvt_roundepu32_ph(A, R) \
1021 ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \
1022 (__v8su)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1024#define _mm256_mask_cvt_roundepu32_ph(W, U, A, R) \
1025 ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask((__v8su)(A), (__v8hf)(W), \
1026 (__mmask8)(U), (int)(R)))
1028#define _mm256_maskz_cvt_roundepu32_ph(U, A, R) \
1029 ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \
1030 (__v8su)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1032#define _mm256_cvt_roundepu32_ps(A, R) \
1033 ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
1034 (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, \
1037#define _mm256_mask_cvt_roundepu32_ps(W, U, A, R) \
1038 ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
1039 (__v8su)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))
1041#define _mm256_maskz_cvt_roundepu32_ps(U, A, R) \
1042 ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
1043 (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), \
1046#define _mm256_cvt_roundepu64_pd(A, R) \
1047 ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
1048 (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \
1051#define _mm256_mask_cvt_roundepu64_pd(W, U, A, R) \
1052 ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
1053 (__v4du)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
1055#define _mm256_maskz_cvt_roundepu64_pd(U, A, R) \
1056 ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
1057 (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
1060#define _mm256_cvt_roundepu64_ph(A, R) \
1061 ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \
1062 (__v4du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1064#define _mm256_mask_cvt_roundepu64_ph(W, U, A, R) \
1065 ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask((__v4du)(A), (__v8hf)(W), \
1066 (__mmask8)(U), (int)(R)))
1068#define _mm256_maskz_cvt_roundepu64_ph(U, A, R) \
1069 ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \
1070 (__v4du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1072#define _mm256_cvt_roundepu64_ps(A, R) \
1073 ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \
1074 (__v4du)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))
1076#define _mm256_mask_cvt_roundepu64_ps(W, U, A, R) \
1077 ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \
1078 (__v4du)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
1080#define _mm256_maskz_cvt_roundepu64_ps(U, A, R) \
1081 ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask((__v4du)(__m256i)(A), \
1082 (__v4sf)_mm_setzero_ps(), \
1083 (__mmask8)(U), (int)(R)))
1085#define _mm256_cvt_roundepu16_ph(A, R) \
1086 ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \
1087 (__v16hu)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \
1090#define _mm256_mask_cvt_roundepu16_ph(W, U, A, R) \
1091 ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask((__v16hu)(A), (__v16hf)(W), \
1092 (__mmask16)(U), (int)(R)))
1094#define _mm256_maskz_cvt_roundepu16_ph(U, A, R) \
1095 ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \
1096 (__v16hu)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1098#define _mm256_cvt_roundepi16_ph(A, R) \
1099 ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
1100 (__v16hi)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \
1103#define _mm256_mask_cvt_roundepi16_ph(W, U, A, R) \
1104 ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask((__v16hi)(A), (__v16hf)(W), \
1105 (__mmask16)(U), (int)(R)))
1107#define _mm256_maskz_cvt_roundepi16_ph(U, A, R) \
1108 ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
1109 (__v16hi)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
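/* Division with explicit rounding control, using the same select-based
   masking pattern as the add macros above. */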
1111#define _mm256_div_round_pd(A, B, R) \
1112 ((__m256d)__builtin_ia32_vdivpd256_round((__v4df)(__m256d)(A), \
1113 (__v4df)(__m256d)(B), (int)(R)))
1115#define _mm256_mask_div_round_pd(W, U, A, B, R) \
1116 ((__m256d)__builtin_ia32_selectpd_256( \
1117 (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \
1118 (__v4df)(__m256d)(W)))
1120#define _mm256_maskz_div_round_pd(U, A, B, R) \
1121 ((__m256d)__builtin_ia32_selectpd_256( \
1122 (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \
1123 (__v4df)_mm256_setzero_pd()))
1125#define _mm256_div_round_ph(A, B, R) \
1126 ((__m256h)__builtin_ia32_vdivph256_round((__v16hf)(__m256h)(A), \
1127 (__v16hf)(__m256h)(B), (int)(R)))
1129#define _mm256_mask_div_round_ph(W, U, A, B, R) \
1130 ((__m256h)__builtin_ia32_selectph_256( \
1131 (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \
1132 (__v16hf)(__m256h)(W)))
1134#define _mm256_maskz_div_round_ph(U, A, B, R) \
1135 ((__m256h)__builtin_ia32_selectph_256( \
1136 (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \
1137 (__v16hf)_mm256_setzero_ph()))
1139#define _mm256_div_round_ps(A, B, R) \
1140 ((__m256)__builtin_ia32_vdivps256_round((__v8sf)(__m256)(A), \
1141 (__v8sf)(__m256)(B), (int)(R)))
1143#define _mm256_mask_div_round_ps(W, U, A, B, R) \
1144 ((__m256)__builtin_ia32_selectps_256( \
1145 (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \
1146 (__v8sf)(__m256)(W)))
1148#define _mm256_maskz_div_round_ps(U, A, B, R) \
1149 ((__m256)__builtin_ia32_selectps_256( \
1150 (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \
1151 (__v8sf)_mm256_setzero_ps()))
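/* Complex FP16 arithmetic: each pair of adjacent FP16 elements is treated as
   one complex number (hence the __v8sf casts). The "fc"/"c" forms multiply by
   the complex conjugate of the second operand. */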
1153#define _mm256_fcmadd_round_pch(A, B, C, R) \
1154 ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \
1155 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1156 (__mmask8)-1, (int)(R)))
1158#define _mm256_mask_fcmadd_round_pch(A, U, B, C, R) \
1159 ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask( \
1160 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1161 (__mmask8)(U), (int)(R)))
1163#define _mm256_mask3_fcmadd_round_pch(A, B, C, U, R) \
1164 ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \
1165 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1166 (__mmask8)(U), (int)(R)))
1168#define _mm256_maskz_fcmadd_round_pch(U, A, B, C, R) \
1169 ((__m256h)__builtin_ia32_vfcmaddcph256_round_maskz( \
1170 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1171 (__mmask8)(U), (int)(R)))
1173#define _mm256_cmul_round_pch(A, B, R) \
1174 ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
1175 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
1176 (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))
1178#define _mm256_mask_cmul_round_pch(W, U, A, B, R) \
1179 ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
1180 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W), \
1181 (__mmask8)(U), (int)(R)))
1183#define _mm256_maskz_cmul_round_pch(U, A, B, R) \
1184 ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
1185 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
1186 (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R)))
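/* VFIXUPIMM: patches special-value inputs (NaN, +/-0, +/-Inf, ...) according
   to the lookup table supplied in C and the immediate imm. */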
1188#define _mm256_fixupimm_round_pd(A, B, C, imm, R) \
1189 ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
1190 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
1191 (int)(imm), (__mmask8)-1, (int)(R)))
1193#define _mm256_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
1194 ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
1195 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
1196 (int)(imm), (__mmask8)(U), (int)(R)))
1198#define _mm256_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
1199 ((__m256d)__builtin_ia32_vfixupimmpd256_round_maskz( \
1200 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
1201 (int)(imm), (__mmask8)(U), (int)(R)))
1203#define _mm256_fixupimm_round_ps(A, B, C, imm, R) \
1204 ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
1205 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
1206 (int)(imm), (__mmask8)-1, (int)(R)))
1208#define _mm256_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
1209 ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
1210 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
1211 (int)(imm), (__mmask8)(U), (int)(R)))
1213#define _mm256_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
1214 ((__m256)__builtin_ia32_vfixupimmps256_round_maskz( \
1215 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
1216 (int)(imm), (__mmask8)(U), (int)(R)))
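/* Fused multiply-add family with rounding control. The fmsub/fnmadd/fnmsub
   variants are expressed by negating the relevant operands of the same
   vfmadd builtin, so only the add form needs a dedicated builtin. */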
1218#define _mm256_fmadd_round_pd(A, B, C, R) \
1219 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1220 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1221 (__mmask8)-1, (int)(R)))
1223#define _mm256_mask_fmadd_round_pd(A, U, B, C, R) \
1224 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1225 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1226 (__mmask8)(U), (int)(R)))
1228#define _mm256_mask3_fmadd_round_pd(A, B, C, U, R) \
1229 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \
1230 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1231 (__mmask8)(U), (int)(R)))
1233#define _mm256_maskz_fmadd_round_pd(U, A, B, C, R) \
1234 ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
1235 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1236 (__mmask8)(U), (int)(R)))
1238#define _mm256_fmsub_round_pd(A, B, C, R) \
1239 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1240 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1241 (__mmask8)-1, (int)(R)))
1243#define _mm256_mask_fmsub_round_pd(A, U, B, C, R) \
1244 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1245 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1246 (__mmask8)(U), (int)(R)))
1248#define _mm256_maskz_fmsub_round_pd(U, A, B, C, R) \
1249 ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
1250 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1251 (__mmask8)(U), (int)(R)))
1253#define _mm256_fnmadd_round_pd(A, B, C, R) \
1254 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1255 -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1256 (__mmask8)-1, (int)(R)))
1258#define _mm256_mask3_fnmadd_round_pd(A, B, C, U, R) \
1259 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \
1260 -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1261 (__mmask8)(U), (int)(R)))
1263#define _mm256_maskz_fnmadd_round_pd(U, A, B, C, R) \
1264 ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
1265 -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1266 (__mmask8)(U), (int)(R)))
1268#define _mm256_fnmsub_round_pd(A, B, C, R) \
1269 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1270 -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1271 (__mmask8)-1, (int)(R)))
1273#define _mm256_maskz_fnmsub_round_pd(U, A, B, C, R) \
1274 ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
1275 -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1276 (__mmask8)(U), (int)(R)))
1278#define _mm256_fmadd_round_ph(A, B, C, R) \
1279 ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1280 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1281 (__mmask16)-1, (int)(R)))
1283#define _mm256_mask_fmadd_round_ph(A, U, B, C, R) \
1284 ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1285 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1286 (__mmask16)(U), (int)(R)))
1288#define _mm256_mask3_fmadd_round_ph(A, B, C, U, R) \
1289 ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
1290 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1291 (__mmask16)(U), (int)(R)))
1293#define _mm256_maskz_fmadd_round_ph(U, A, B, C, R) \
1294 ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
1295 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1296 (__mmask16)(U), (int)(R)))
1298#define _mm256_fmsub_round_ph(A, B, C, R) \
1299 ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1300 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1301 (__mmask16)-1, (int)(R)))
1303#define _mm256_mask_fmsub_round_ph(A, U, B, C, R) \
1304 ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1305 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1306 (__mmask16)(U), (int)(R)))
1308#define _mm256_maskz_fmsub_round_ph(U, A, B, C, R) \
1309 ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
1310 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1311 (__mmask16)(U), (int)(R)))
1313#define _mm256_fnmadd_round_ph(A, B, C, R) \
1314 ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1315 (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1316 (__mmask16)-1, (int)(R)))
1318#define _mm256_mask3_fnmadd_round_ph(A, B, C, U, R) \
1319 ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
1320 -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1321 (__mmask16)(U), (int)(R)))
1323#define _mm256_maskz_fnmadd_round_ph(U, A, B, C, R) \
1324 ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
1325 -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1326 (__mmask16)(U), (int)(R)))
1328#define _mm256_fnmsub_round_ph(A, B, C, R) \
1329 ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1330 (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1331 (__mmask16)-1, (int)(R)))
1333#define _mm256_maskz_fnmsub_round_ph(U, A, B, C, R) \
1334 ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
1335 -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1336 (__mmask16)(U), (int)(R)))
1338#define _mm256_fmadd_round_ps(A, B, C, R) \
1339 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1340 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1341 (__mmask8)-1, (int)(R)))
1343#define _mm256_mask_fmadd_round_ps(A, U, B, C, R) \
1344 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1345 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1346 (__mmask8)(U), (int)(R)))
1348#define _mm256_mask3_fmadd_round_ps(A, B, C, U, R) \
1349 ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
1350 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1351 (__mmask8)(U), (int)(R)))
1353#define _mm256_maskz_fmadd_round_ps(U, A, B, C, R) \
1354 ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1355 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1356 (__mmask8)(U), (int)(R)))
1358#define _mm256_fmsub_round_ps(A, B, C, R) \
1359 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1360 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1361 (__mmask8)-1, (int)(R)))
1363#define _mm256_mask_fmsub_round_ps(A, U, B, C, R) \
1364 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1365 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1366 (__mmask8)(U), (int)(R)))
1368#define _mm256_maskz_fmsub_round_ps(U, A, B, C, R) \
1369 ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1370 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1371 (__mmask8)(U), (int)(R)))
1373#define _mm256_fnmadd_round_ps(A, B, C, R) \
1374 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1375 (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1376 (__mmask8)-1, (int)(R)))
1378#define _mm256_mask3_fnmadd_round_ps(A, B, C, U, R) \
1379 ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
1380 -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1381 (__mmask8)(U), (int)(R)))
1383#define _mm256_maskz_fnmadd_round_ps(U, A, B, C, R) \
1384 ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1385 -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1386 (__mmask8)(U), (int)(R)))
1388#define _mm256_fnmsub_round_ps(A, B, C, R) \
1389 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1390 (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1391 (__mmask8)-1, (int)(R)))
1393#define _mm256_maskz_fnmsub_round_ps(U, A, B, C, R) \
1394 ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
1395 -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1396 (__mmask8)(U), (int)(R)))
1398#define _mm256_fmadd_round_pch(A, B, C, R) \
1399 ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
1400 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1401 (__mmask8)-1, (int)(R)))
1403#define _mm256_mask_fmadd_round_pch(A, U, B, C, R) \
1404 ((__m256h)__builtin_ia32_vfmaddcph256_round_mask( \
1405 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1406 (__mmask8)(U), (int)(R)))
1408#define _mm256_mask3_fmadd_round_pch(A, B, C, U, R) \
1409 ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
1410 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1411 (__mmask8)(U), (int)(R)))
1413#define _mm256_maskz_fmadd_round_pch(U, A, B, C, R) \
1414 ((__m256h)__builtin_ia32_vfmaddcph256_round_maskz( \
1415 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
1416 (__mmask8)(U), (int)(R)))
1418#define _mm256_fmaddsub_round_pd(A, B, C, R) \
1419 ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
1420 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1421 (__mmask8)-1, (int)(R)))
1423#define _mm256_mask_fmaddsub_round_pd(A, U, B, C, R) \
1424 ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
1425 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1426 (__mmask8)(U), (int)(R)))
1428#define _mm256_mask3_fmaddsub_round_pd(A, B, C, U, R) \
1429 ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask3( \
1430 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1431 (__mmask8)(U), (int)(R)))
1433#define _mm256_maskz_fmaddsub_round_pd(U, A, B, C, R) \
1434 ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
1435 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1436 (__mmask8)(U), (int)(R)))
1438#define _mm256_fmsubadd_round_pd(A, B, C, R) \
1439 ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
1440 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1441 (__mmask8)-1, (int)(R)))
1443#define _mm256_mask_fmsubadd_round_pd(A, U, B, C, R) \
1444 ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
1445 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1446 (__mmask8)(U), (int)(R)))
1448#define _mm256_maskz_fmsubadd_round_pd(U, A, B, C, R) \
1449 ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
1450 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1451 (__mmask8)(U), (int)(R)))
1453#define _mm256_fmaddsub_round_ph(A, B, C, R) \
1454 ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
1455 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1456 (__mmask16)-1, (int)(R)))
1458#define _mm256_mask_fmaddsub_round_ph(A, U, B, C, R) \
1459 ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
1460 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1461 (__mmask16)(U), (int)(R)))
1463#define _mm256_mask3_fmaddsub_round_ph(A, B, C, U, R) \
1464 ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask3( \
1465 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1466 (__mmask16)(U), (int)(R)))
1468#define _mm256_maskz_fmaddsub_round_ph(U, A, B, C, R) \
1469 ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
1470 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1471 (__mmask16)(U), (int)(R)))
1473#define _mm256_fmsubadd_round_ph(A, B, C, R) \
1474 ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
1475 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1476 (__mmask16)-1, (int)(R)))
1478#define _mm256_mask_fmsubadd_round_ph(A, U, B, C, R) \
1479 ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
1480 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1481 (__mmask16)(U), (int)(R)))
1483#define _mm256_maskz_fmsubadd_round_ph(U, A, B, C, R) \
1484 ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
1485 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1486 (__mmask16)(U), (int)(R)))
1488#define _mm256_fmaddsub_round_ps(A, B, C, R) \
1489 ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
1490 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1491 (__mmask8)-1, (int)(R)))
1493#define _mm256_mask_fmaddsub_round_ps(A, U, B, C, R) \
1494 ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
1495 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1496 (__mmask8)(U), (int)(R)))
1498#define _mm256_mask3_fmaddsub_round_ps(A, B, C, U, R) \
1499 ((__m256)__builtin_ia32_vfmaddsubps256_round_mask3( \
1500 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1501 (__mmask8)(U), (int)(R)))
1503#define _mm256_maskz_fmaddsub_round_ps(U, A, B, C, R) \
1504 ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
1505 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1506 (__mmask8)(U), (int)(R)))
1508#define _mm256_fmsubadd_round_ps(A, B, C, R) \
1509 ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
1510 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1511 (__mmask8)-1, (int)(R)))
1513#define _mm256_mask_fmsubadd_round_ps(A, U, B, C, R) \
1514 ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
1515 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1516 (__mmask8)(U), (int)(R)))
1518#define _mm256_maskz_fmsubadd_round_ps(U, A, B, C, R) \
1519 ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
1520 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1521 (__mmask8)(U), (int)(R)))
1522#define _mm256_mask3_fmsub_round_pd(A, B, C, U, R) \
1523 ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
1524 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1525 (__mmask8)(U), (int)(R)))
1527#define _mm256_mask3_fmsubadd_round_pd(A, B, C, U, R) \
1528 ((__m256d)__builtin_ia32_vfmsubaddpd256_round_mask3( \
1529 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1530 (__mmask8)(U), (int)(R)))
1532#define _mm256_mask_fnmadd_round_pd(A, U, B, C, R) \
1533 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1534 (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1535 (__mmask8)(U), (int)(R)))
1537#define _mm256_mask_fnmsub_round_pd(A, U, B, C, R) \
1538 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
1539 (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
1540 (__mmask8)(U), (int)(R)))
1542#define _mm256_mask3_fnmsub_round_pd(A, B, C, U, R) \
1543 ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
1544 -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
1545 (__mmask8)(U), (int)(R)))
1547#define _mm256_mask3_fmsub_round_ph(A, B, C, U, R) \
1548 ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
1549 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1550 (__mmask16)(U), (int)(R)))
1552#define _mm256_mask3_fmsubadd_round_ph(A, B, C, U, R) \
1553 ((__m256h)__builtin_ia32_vfmsubaddph256_round_mask3( \
1554 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1555 (__mmask16)(U), (int)(R)))
1557#define _mm256_mask_fnmadd_round_ph(A, U, B, C, R) \
1558 ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1559 (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1560 (__mmask16)(U), (int)(R)))
1562#define _mm256_mask_fnmsub_round_ph(A, U, B, C, R) \
1563 ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
1564 (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
1565 (__mmask16)(U), (int)(R)))
1567#define _mm256_mask3_fnmsub_round_ph(A, B, C, U, R) \
1568 ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
1569 -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
1570 (__mmask16)(U), (int)(R)))
1572#define _mm256_mask3_fmsub_round_ps(A, B, C, U, R) \
1573 ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
1574 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1575 (__mmask8)(U), (int)(R)))
1577#define _mm256_mask3_fmsubadd_round_ps(A, B, C, U, R) \
1578 ((__m256)__builtin_ia32_vfmsubaddps256_round_mask3( \
1579 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1580 (__mmask8)(U), (int)(R)))
1582#define _mm256_mask_fnmadd_round_ps(A, U, B, C, R) \
1583 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1584 (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1585 (__mmask8)(U), (int)(R)))
1587#define _mm256_mask_fnmsub_round_ps(A, U, B, C, R) \
1588 ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
1589 (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
1590 (__mmask8)(U), (int)(R)))
1592#define _mm256_mask3_fnmsub_round_ps(A, B, C, U, R) \
1593 ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
1594 -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
1595 (__mmask8)(U), (int)(R)))
1597#define _mm256_mul_round_pch(A, B, R) \
1598 ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
1599 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
1600 (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))
1602#define _mm256_mask_mul_round_pch(W, U, A, B, R) \
1603 ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
1604 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W), \
1605 (__mmask8)(U), (int)(R)))
1607#define _mm256_maskz_mul_round_pch(U, A, B, R) \
1608 ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
1609 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
1610 (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R)))
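/* Illustrative sketch (added, not part of the original header): the _pch
 * multiplies treat a __m256h as eight complex FP16 values with interleaved
 * real/imaginary parts, and the __mmask8 selects whole complex elements.
 * The helper name below is hypothetical. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 demo_mask_mul_round_pch(
    __m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
  /* Complex products __A * __B with round-to-nearest and exceptions
   * suppressed; unselected complex elements keep __W. */
  return _mm256_mask_mul_round_pch(__W, __U, __A, __B,
                                   _MM_FROUND_TO_NEAREST_INT |
                                       _MM_FROUND_NO_EXC);
}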
1612#define _mm256_getexp_round_pd(A, R) \
1613 ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
1614 (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1, \
1615 (int)(R)))
1617#define _mm256_mask_getexp_round_pd(W, U, A, R) \
1618 ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
1619 (__v4df)(__m256d)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
1621#define _mm256_maskz_getexp_round_pd(U, A, R) \
1622 ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
1623 (__v4df)(__m256d)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
1624 (int)(R)))
1626#define _mm256_getexp_round_ph(A, R) \
1627 ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
1628 (__v16hf)(__m256h)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, \
1629 (int)(R)))
1631#define _mm256_mask_getexp_round_ph(W, U, A, R) \
1632 ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
1633 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R)))
1635#define _mm256_maskz_getexp_round_ph(U, A, R) \
1636 ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
1637 (__v16hf)(__m256h)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), \
1638 (int)(R)))
1640#define _mm256_getexp_round_ps(A, R) \
1641 ((__m256)__builtin_ia32_vgetexpps256_round_mask( \
1642 (__v8sf)(__m256)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, \
1643 (int)(R)))
1645#define _mm256_mask_getexp_round_ps(W, U, A, R) \
1646 ((__m256)__builtin_ia32_vgetexpps256_round_mask( \
1647 (__v8sf)(__m256)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))
1649#define _mm256_maskz_getexp_round_ps(U, A, R) \
1650 ((__m256)__builtin_ia32_vgetexpps256_round_mask((__v8sf)(__m256)(A), \
1651 (__v8sf)_mm256_setzero_ps(), \
1652 (__mmask8)(U), (int)(R)))
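/* Illustrative sketch (added, not part of the original header): getexp
 * returns the unbiased exponent of each element as a floating-point value,
 * i.e. floor(log2(|x|)) for normal inputs; the R argument here only controls
 * exception suppression (SAE).  The helper name below is hypothetical. */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
demo_getexp_round_pd(__m256d __A) {
  /* Per-element exponents with floating-point exceptions suppressed. */
  return _mm256_getexp_round_pd(__A, _MM_FROUND_NO_EXC);
}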
1654#define _mm256_getmant_round_pd(A, B, C, R) \
1655 ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
1656 (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), \
1657 (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))
1659#define _mm256_mask_getmant_round_pd(W, U, A, B, C, R) \
1660 ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
1661 (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), (__v4df)(__m256d)(W), \
1662 (__mmask8)(U), (int)(R)))
1664#define _mm256_maskz_getmant_round_pd(U, A, B, C, R) \
1665 ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
1666 (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), \
1667 (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
1669#define _mm256_getmant_round_ph(A, B, C, R) \
1670 ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
1671 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
1672 (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))
1674#define _mm256_mask_getmant_round_ph(W, U, A, B, C, R) \
1675 ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
1676 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \
1677 (__mmask16)(U), (int)(R)))
1679#define _mm256_maskz_getmant_round_ph(U, A, B, C, R) \
1680 ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
1681 (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
1682 (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1684#define _mm256_getmant_round_ps(A, B, C, R) \
1685 ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
1686 (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), \
1687 (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))
1689#define _mm256_mask_getmant_round_ps(W, U, A, B, C, R) \
1690 ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
1691 (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), (__v8sf)(__m256)(W), \
1692 (__mmask8)(U), (int)(R)))
1694#define _mm256_maskz_getmant_round_ps(U, A, B, C, R) \
1695 ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
1696 (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), \
1697 (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
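/* Illustrative sketch (added, not part of the original header): the getmant
 * macros pack their two controls into a single immediate as ((C) << 2) | (B),
 * where B is an _MM_MANT_NORM_* interval and C an _MM_MANT_SIGN_* sign
 * control from the AVX-512 headers.  The helper name below is hypothetical. */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
demo_getmant_round_pd(__m256d __A) {
  /* Mantissas normalized to [1, 2) with the sign forced to zero, SAE set. */
  return _mm256_getmant_round_pd(__A, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero,
                                 _MM_FROUND_NO_EXC);
}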
1699#define _mm256_max_round_pd(A, B, R) \
1700 ((__m256d)__builtin_ia32_vmaxpd256_round((__v4df)(__m256d)(A), \
1701 (__v4df)(__m256d)(B), (int)(R)))
1703#define _mm256_mask_max_round_pd(W, U, A, B, R) \
1704 ((__m256d)__builtin_ia32_selectpd_256( \
1705 (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)), \
1706 (__v4df)(__m256d)(W)))
1708#define _mm256_maskz_max_round_pd(U, A, B, R) \
1709 ((__m256d)__builtin_ia32_selectpd_256( \
1710 (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)), \
1711 (__v4df)_mm256_setzero_pd()))
1713#define _mm256_max_round_ph(A, B, R) \
1714 ((__m256h)__builtin_ia32_vmaxph256_round((__v16hf)(__m256h)(A), \
1715 (__v16hf)(__m256h)(B), (int)(R)))
1717#define _mm256_mask_max_round_ph(W, U, A, B, R) \
1718 ((__m256h)__builtin_ia32_selectph_256( \
1719 (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)), \
1720 (__v16hf)(__m256h)(W)))
1722#define _mm256_maskz_max_round_ph(U, A, B, R) \
1723 ((__m256h)__builtin_ia32_selectph_256( \
1724 (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)), \
1725 (__v16hf)_mm256_setzero_ph()))
1727#define _mm256_max_round_ps(A, B, R) \
1728 ((__m256)__builtin_ia32_vmaxps256_round((__v8sf)(__m256)(A), \
1729 (__v8sf)(__m256)(B), (int)(R)))
1731#define _mm256_mask_max_round_ps(W, U, A, B, R) \
1732 ((__m256)__builtin_ia32_selectps_256( \
1733 (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)), \
1734 (__v8sf)(__m256)(W)))
1736#define _mm256_maskz_max_round_ps(U, A, B, R) \
1737 ((__m256)__builtin_ia32_selectps_256( \
1738 (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)), \
1739 (__v8sf)_mm256_setzero_ps()))
1741#define _mm256_min_round_pd(A, B, R) \
1742 ((__m256d)__builtin_ia32_vminpd256_round((__v4df)(__m256d)(A), \
1743 (__v4df)(__m256d)(B), (int)(R)))
1745#define _mm256_mask_min_round_pd(W, U, A, B, R) \
1746 ((__m256d)__builtin_ia32_selectpd_256( \
1747 (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)), \
1748 (__v4df)(__m256d)(W)))
1750#define _mm256_maskz_min_round_pd(U, A, B, R) \
1751 ((__m256d)__builtin_ia32_selectpd_256( \
1752 (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)), \
1753 (__v4df)_mm256_setzero_pd()))
1755#define _mm256_min_round_ph(A, B, R) \
1756 ((__m256h)__builtin_ia32_vminph256_round((__v16hf)(__m256h)(A), \
1757 (__v16hf)(__m256h)(B), (int)(R)))
1759#define _mm256_mask_min_round_ph(W, U, A, B, R) \
1760 ((__m256h)__builtin_ia32_selectph_256( \
1761 (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)), \
1762 (__v16hf)(__m256h)(W)))
1764#define _mm256_maskz_min_round_ph(U, A, B, R) \
1765 ((__m256h)__builtin_ia32_selectph_256( \
1766 (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)), \
1767 (__v16hf)_mm256_setzero_ph()))
1769#define _mm256_min_round_ps(A, B, R) \
1770 ((__m256)__builtin_ia32_vminps256_round((__v8sf)(__m256)(A), \
1771 (__v8sf)(__m256)(B), (int)(R)))
1773#define _mm256_mask_min_round_ps(W, U, A, B, R) \
1774 ((__m256)__builtin_ia32_selectps_256( \
1775 (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)), \
1776 (__v8sf)(__m256)(W)))
1778#define _mm256_maskz_min_round_ps(U, A, B, R) \
1779 ((__m256)__builtin_ia32_selectps_256( \
1780 (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)), \
1781 (__v8sf)_mm256_setzero_ps()))
1783#define _mm256_mul_round_pd(A, B, R) \
1784 ((__m256d)__builtin_ia32_vmulpd256_round((__v4df)(__m256d)(A), \
1785 (__v4df)(__m256d)(B), (int)(R)))
1787#define _mm256_mask_mul_round_pd(W, U, A, B, R) \
1788 ((__m256d)__builtin_ia32_selectpd_256( \
1789 (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)), \
1790 (__v4df)(__m256d)(W)))
1792#define _mm256_maskz_mul_round_pd(U, A, B, R) \
1793 ((__m256d)__builtin_ia32_selectpd_256( \
1794 (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)), \
1795 (__v4df)_mm256_setzero_pd()))
1797#define _mm256_mul_round_ph(A, B, R) \
1798 ((__m256h)__builtin_ia32_vmulph256_round((__v16hf)(__m256h)(A), \
1799 (__v16hf)(__m256h)(B), (int)(R)))
1801#define _mm256_mask_mul_round_ph(W, U, A, B, R) \
1802 ((__m256h)__builtin_ia32_selectph_256( \
1803 (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \
1804 (__v16hf)(__m256h)(W)))
1806#define _mm256_maskz_mul_round_ph(U, A, B, R) \
1807 ((__m256h)__builtin_ia32_selectph_256( \
1808 (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \
1809 (__v16hf)_mm256_setzero_ph()))
1811#define _mm256_mul_round_ps(A, B, R) \
1812 ((__m256)__builtin_ia32_vmulps256_round((__v8sf)(__m256)(A), \
1813 (__v8sf)(__m256)(B), (int)(R)))
1815#define _mm256_mask_mul_round_ps(W, U, A, B, R) \
1816 ((__m256)__builtin_ia32_selectps_256( \
1817 (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \
1818 (__v8sf)(__m256)(W)))
1820#define _mm256_maskz_mul_round_ps(U, A, B, R) \
1821 ((__m256)__builtin_ia32_selectps_256( \
1822 (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \
1823 (__v8sf)_mm256_setzero_ps()))
1825#define _mm256_range_round_pd(A, B, C, R) \
1826 ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
1827 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
1828 (__v4df)_mm256_setzero_pd(), (__mmask8)-1, (int)(R)))
1830#define _mm256_mask_range_round_pd(W, U, A, B, C, R) \
1831 ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
1832 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
1833 (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))
1835#define _mm256_maskz_range_round_pd(U, A, B, C, R) \
1836 ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
1837 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
1838 (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))
1840#define _mm256_range_round_ps(A, B, C, R) \
1841 ((__m256)__builtin_ia32_vrangeps256_round_mask( \
1842 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
1843 (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, (int)(R)))
1845#define _mm256_mask_range_round_ps(W, U, A, B, C, R) \
1846 ((__m256)__builtin_ia32_vrangeps256_round_mask( \
1847 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
1848 (__mmask8)(U), (int)(R)))
1850#define _mm256_maskz_range_round_ps(U, A, B, C, R) \
1851 ((__m256)__builtin_ia32_vrangeps256_round_mask( \
1852 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
1853 (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
1855#define _mm256_reduce_round_pd(A, B, R) \
1856 ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
1857 (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), \
1858 (__mmask8)-1, (int)(R)))
1860#define _mm256_mask_reduce_round_pd(W, U, A, B, R) \
1861 ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
1862 (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U), \
1863 (int)(R)))
1865#define _mm256_maskz_reduce_round_pd(U, A, B, R) \
1866 ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
1867 (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), \
1868 (__mmask8)(U), (int)(R)))
1870#define _mm256_mask_reduce_round_ph(W, U, A, imm, R) \
1871 ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
1872 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \
1873 (__mmask16)(U), (int)(R)))
1875#define _mm256_maskz_reduce_round_ph(U, A, imm, R) \
1876 ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
1877 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
1878 (__mmask16)(U), (int)(R)))
1880#define _mm256_reduce_round_ph(A, imm, R) \
1881 ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
1882 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \
1883 (__mmask16)-1, (int)(R)))
1885#define _mm256_reduce_round_ps(A, B, R) \
1886 ((__m256)__builtin_ia32_vreduceps256_round_mask( \
1887 (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), \
1888 (__mmask8)-1, (int)(R)))
1890#define _mm256_mask_reduce_round_ps(W, U, A, B, R) \
1891 ((__m256)__builtin_ia32_vreduceps256_round_mask( \
1892 (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U), \
1893 (int)(R)))
1895#define _mm256_maskz_reduce_round_ps(U, A, B, R) \
1896 ((__m256)__builtin_ia32_vreduceps256_round_mask( \
1897 (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), \
1898 (__mmask8)(U), (int)(R)))
1900#define _mm256_roundscale_round_pd(A, imm, R) \
1901 ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
1902 (__v4df)(__m256d)(A), (int)(imm), (__v4df)_mm256_undefined_pd(), \
1903 (__mmask8)-1, (int)(R)))
1905#define _mm256_mask_roundscale_round_pd(A, B, C, imm, R) \
1906 ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
1907 (__v4df)(__m256d)(C), (int)(imm), (__v4df)(__m256d)(A), (__mmask8)(B), \
1908 (int)(R)))
1910#define _mm256_maskz_roundscale_round_pd(A, B, imm, R) \
1911 ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
1912 (__v4df)(__m256d)(B), (int)(imm), (__v4df)_mm256_setzero_pd(), \
1913 (__mmask8)(A), (int)(R)))
1915#define _mm256_roundscale_round_ph(A, imm, R) \
1916 ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
1917 (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \
1918 (__mmask16)-1, (int)(R)))
1920#define _mm256_mask_roundscale_round_ph(A, B, C, imm, R) \
1921 ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
1922 (__v16hf)(__m256h)(C), (int)(imm), (__v16hf)(__m256h)(A), \
1923 (__mmask16)(B), (int)(R)))
1925#define _mm256_maskz_roundscale_round_ph(A, B, imm, R) \
1926 ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
1927 (__v16hf)(__m256h)(B), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
1928 (__mmask16)(A), (int)(R)))
1930#define _mm256_roundscale_round_ps(A, imm, R) \
1931 ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
1932 (__v8sf)(__m256)(A), (int)(imm), (__v8sf)_mm256_undefined_ps(), \
1933 (__mmask8)-1, (int)(R)))
1935#define _mm256_mask_roundscale_round_ps(A, B, C, imm, R) \
1936 ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
1937 (__v8sf)(__m256)(C), (int)(imm), (__v8sf)(__m256)(A), (__mmask8)(B), \
1938 (int)(R)))
1940#define _mm256_maskz_roundscale_round_ps(A, B, imm, R) \
1941 ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
1942 (__v8sf)(__m256)(B), (int)(imm), (__v8sf)_mm256_setzero_ps(), \
1943 (__mmask8)(A), (int)(R)))
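/* Illustrative sketch (added, not part of the original header): in the
 * roundscale immediate, the low bits pick the rounding mode and bit 3
 * suppresses the precision exception, so the familiar AVX-512 floor() idiom
 * carries over.  The helper name below is hypothetical. */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
demo_floor_round_pd(__m256d __A) {
  /* Round each element down to an integral value; both the immediate and the
   * SAE argument suppress exceptions. */
  return _mm256_roundscale_round_pd(
      __A, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC, _MM_FROUND_NO_EXC);
}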
1945#define _mm256_scalef_round_pd(A, B, R) \
1946 ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
1947 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), \
1948 (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))
1950#define _mm256_mask_scalef_round_pd(W, U, A, B, R) \
1951 ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
1952 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(W), \
1953 (__mmask8)(U), (int)(R)))
1955#define _mm256_maskz_scalef_round_pd(U, A, B, R) \
1956 ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
1957 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)_mm256_setzero_pd(), \
1958 (__mmask8)(U), (int)(R)))
1960#define _mm256_scalef_round_ph(A, B, R) \
1961 ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
1962 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \
1963 (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))
1965#define _mm256_mask_scalef_round_ph(W, U, A, B, R) \
1966 ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
1967 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(W), \
1968 (__mmask16)(U), (int)(R)))
1970#define _mm256_maskz_scalef_round_ph(U, A, B, R) \
1971 ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
1972 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \
1973 (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1975#define _mm256_scalef_round_ps(A, B, R) \
1976 ((__m256)__builtin_ia32_vscalefps256_round_mask( \
1977 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_undefined_ps(), \
1978 (__mmask8)-1, (int)(R)))
1980#define _mm256_mask_scalef_round_ps(W, U, A, B, R) \
1981 ((__m256)__builtin_ia32_vscalefps256_round_mask( \
1982 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(W), \
1983 (__mmask8)(U), (int)(R)))
1985#define _mm256_maskz_scalef_round_ps(U, A, B, R) \
1986 ((__m256)__builtin_ia32_vscalefps256_round_mask( \
1987 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_setzero_ps(), \
1988 (__mmask8)(U), (int)(R)))
1990#define _mm256_sqrt_round_pd(A, R) \
1991 ((__m256d)__builtin_ia32_vsqrtpd256_round((__v4df)(__m256d)(A), (int)(R)))
1993#define _mm256_mask_sqrt_round_pd(W, U, A, R) \
1994 ((__m256d)__builtin_ia32_selectpd_256( \
1995 (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)), \
1996 (__v4df)(__m256d)(W)))
1998#define _mm256_maskz_sqrt_round_pd(U, A, R) \
1999 ((__m256d)__builtin_ia32_selectpd_256( \
2000 (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)), \
2001 (__v4df)_mm256_setzero_pd()))
2003#define _mm256_sqrt_round_ph(A, R) \
2004 ((__m256h)__builtin_ia32_vsqrtph256_round((__v16hf)(__m256h)(A), (int)(R)))
2006#define _mm256_mask_sqrt_round_ph(W, U, A, R) \
2007 ((__m256h)__builtin_ia32_selectph_256( \
2008 (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)), \
2009 (__v16hf)(__m256h)(W)))
2011#define _mm256_maskz_sqrt_round_ph(U, A, R) \
2012 ((__m256h)__builtin_ia32_selectph_256( \
2013 (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)), \
2014 (__v16hf)_mm256_setzero_ph()))
2016#define _mm256_sqrt_round_ps(A, R) \
2017 ((__m256)__builtin_ia32_vsqrtps256_round((__v8sf)(__m256)(A), (int)(R)))
2019#define _mm256_mask_sqrt_round_ps(W, U, A, R) \
2020 ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
2021 (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
2022 (__v8sf)(__m256)(W)))
2024#define _mm256_maskz_sqrt_round_ps(U, A, R) \
2025 ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
2026 (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
2027 (__v8sf)_mm256_setzero_ps()))
2029#define _mm256_sub_round_pd(A, B, R) \
2030 ((__m256d)__builtin_ia32_vsubpd256_round((__v4df)(__m256d)(A), \
2031 (__v4df)(__m256d)(B), (int)(R)))
2033#define _mm256_mask_sub_round_pd(W, U, A, B, R) \
2034 ((__m256d)__builtin_ia32_selectpd_256( \
2035 (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)), \
2036 (__v4df)(__m256d)(W)))
2038#define _mm256_maskz_sub_round_pd(U, A, B, R) \
2039 ((__m256d)__builtin_ia32_selectpd_256( \
2040 (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)), \
2041 (__v4df)_mm256_setzero_pd()))
2043#define _mm256_sub_round_ph(A, B, R) \
2044 ((__m256h)__builtin_ia32_vsubph256_round((__v16hf)(__m256h)(A), \
2045 (__v16hf)(__m256h)(B), (int)(R)))
2047#define _mm256_mask_sub_round_ph(W, U, A, B, R) \
2048 ((__m256h)__builtin_ia32_selectph_256( \
2049 (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \
2050 (__v16hf)(__m256h)(W)))
2052#define _mm256_maskz_sub_round_ph(U, A, B, R) \
2053 ((__m256h)__builtin_ia32_selectph_256( \
2054 (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \
2055 (__v16hf)_mm256_setzero_ph()))
2057#define _mm256_sub_round_ps(A, B, R) \
2058 ((__m256)__builtin_ia32_vsubps256_round((__v8sf)(__m256)(A), \
2059 (__v8sf)(__m256)(B), (int)(R)))
2061#define _mm256_mask_sub_round_ps(W, U, A, B, R) \
2062 ((__m256)__builtin_ia32_selectps_256( \
2063 (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \
2064 (__v8sf)(__m256)(W)))
2066#define _mm256_maskz_sub_round_ps(U, A, B, R) \
2067 ((__m256)__builtin_ia32_selectps_256( \
2068 (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \
2069 (__v8sf)_mm256_setzero_ps()))
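/* Illustrative sketch (added, not part of the original header): the explicit
 * rounding argument enables directed rounding without touching MXCSR, for
 * example bracketing a difference from below and above.  The struct and
 * helper names below are hypothetical. */
struct demo_interval_pd {
  __m256d lo, hi;
};
static __inline__ struct demo_interval_pd __DEFAULT_FN_ATTRS256
demo_sub_bracket_round_pd(__m256d __A, __m256d __B) {
  struct demo_interval_pd __r;
  /* Lower bound rounds toward -inf, upper bound toward +inf. */
  __r.lo = _mm256_sub_round_pd(__A, __B,
                               _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
  __r.hi = _mm256_sub_round_pd(__A, __B,
                               _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
  return __r;
}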
2071#undef __DEFAULT_FN_ATTRS256
2072#undef __DEFAULT_FN_ATTRS128
#endif /* __AVX10_2NIINTRIN_H */
#endif /* __SSE2__ */