clang 20.0.0git
|
Go to the source code of this file.
Macros | |
#define | __DEFAULT_FN_ATTRS256 |
#define | __DEFAULT_FN_ATTRS128 |
#define | _mm256_mpsadbw_epu8(X, Y, M) |
Computes sixteen sum of absolute difference (SAD) operations on sets of four unsigned 8-bit integers from the 256-bit integer vectors X and Y. | |
#define | _mm256_alignr_epi8(a, b, n) |
Uses the lower half of the 256-bit vector a as the upper half of a temporary 256-bit value, and the lower half of the 256-bit vector b as the lower half of the temporary value. | |
#define | _mm256_blend_epi16(V1, V2, M) |
Merges 16-bit integer values from either of the two 256-bit vectors V1 or V2, as specified by the immediate integer operand M, and returns the resulting 256-bit vector of [16 x i16]. | |
#define | _mm256_shuffle_epi32(a, imm) ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))) |
Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in a according to control information in the integer literal imm, and returns the 256-bit result. | |
#define | _mm256_shufflehi_epi16(a, imm) ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))) |
Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in a according to control information in the integer literal imm, and returns the 256-bit result. | |
#define | _mm256_shufflelo_epi16(a, imm) ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))) |
Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in a according to control information in the integer literal imm, and returns the 256-bit [16 x i16] result. | |
#define | _mm256_slli_si256(a, imm) ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) |
Shifts each 128-bit half of the 256-bit integer vector a left by imm bytes, shifting in zero bytes, and returns the result. | |
#define | _mm256_bslli_epi128(a, imm) ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) |
Shifts each 128-bit half of the 256-bit integer vector a left by imm bytes, shifting in zero bytes, and returns the result. | |
#define | _mm256_srli_si256(a, imm) ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) |
Shifts each 128-bit half of the 256-bit integer vector in a right by imm bytes, shifting in zero bytes, and returns the result. | |
#define | _mm256_bsrli_epi128(a, imm) ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) |
Shifts each 128-bit half of the 256-bit integer vector in a right by imm bytes, shifting in zero bytes, and returns the result. | |
#define | _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) |
#define | _mm_blend_epi32(V1, V2, M) |
Merges 32-bit integer elements from either of the two 128-bit vectors of [4 x i32] in V1 or V2 to the result's 128-bit vector of [4 x i32], as specified by the immediate integer operand M. | |
#define | _mm256_blend_epi32(V1, V2, M) |
Merges 32-bit integer elements from either of the two 256-bit vectors of [8 x i32] in V1 or V2 to return a 256-bit vector of [8 x i32], as specified by the immediate integer operand M. | |
#define | _mm256_permute4x64_pd(V, M) ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))) |
Sets the result's 256-bit vector of [4 x double] to copies of elements of the 256-bit vector of [4 x double] in V as specified by the immediate value M. | |
#define | _mm256_permute4x64_epi64(V, M) ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))) |
Sets the result's 256-bit vector of [4 x i64] to copies of elements of the 256-bit vector of [4 x i64] in V as specified by the immediate value M. | |
#define | _mm256_permute2x128_si256(V1, V2, M) ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))) |
Sets each half of the 256-bit result either to zero or to one of the four possible 128-bit halves of the 256-bit vectors V1 and V2, as specified by the immediate value M. | |
#define | _mm256_extracti128_si256(V, M) ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))) |
Extracts half of the 256-bit vector V to the 128-bit result. | |
#define | _mm256_inserti128_si256(V1, V2, M) |
Copies the 256-bit vector V1 to the result, then overwrites half of the result with the 128-bit vector V2. | |
#define | _mm_mask_i32gather_pd(a, m, i, mask, s) |
Conditionally gathers two 64-bit floating-point values, either from the 128-bit vector of [2 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm256_mask_i32gather_pd(a, m, i, mask, s) |
Conditionally gathers four 64-bit floating-point values, either from the 256-bit vector of [4 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm_mask_i64gather_pd(a, m, i, mask, s) |
Conditionally gathers two 64-bit floating-point values, either from the 128-bit vector of [2 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i. | |
#define | _mm256_mask_i64gather_pd(a, m, i, mask, s) |
Conditionally gathers four 64-bit floating-point values, either from the 256-bit vector of [4 x double] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i. | |
#define | _mm_mask_i32gather_ps(a, m, i, mask, s) |
Conditionally gathers four 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm256_mask_i32gather_ps(a, m, i, mask, s) |
Conditionally gathers eight 32-bit floating-point values, either from the 256-bit vector of [8 x float] in a, or from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i. | |
#define | _mm_mask_i64gather_ps(a, m, i, mask, s) |
Conditionally gathers two 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i. | |
#define | _mm256_mask_i64gather_ps(a, m, i, mask, s) |
Conditionally gathers four 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i. | |
#define | _mm_mask_i32gather_epi32(a, m, i, mask, s) |
Conditionally gathers four 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm256_mask_i32gather_epi32(a, m, i, mask, s) |
Conditionally gathers eight 32-bit integer values, either from the 256-bit vector of [8 x i32] in a, or from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i. | |
#define | _mm_mask_i64gather_epi32(a, m, i, mask, s) |
Conditionally gathers two 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i. | |
#define | _mm256_mask_i64gather_epi32(a, m, i, mask, s) |
Conditionally gathers four 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i. | |
#define | _mm_mask_i32gather_epi64(a, m, i, mask, s) |
Conditionally gathers two 64-bit integer values, either from the 128-bit vector of [2 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm256_mask_i32gather_epi64(a, m, i, mask, s) |
Conditionally gathers four 64-bit integer values, either from the 256-bit vector of [4 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm_mask_i64gather_epi64(a, m, i, mask, s) |
Conditionally gathers two 64-bit integer values, either from the 128-bit vector of [2 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i. | |
#define | _mm256_mask_i64gather_epi64(a, m, i, mask, s) |
Conditionally gathers four 64-bit integer values, either from the 256-bit vector of [4 x i64] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i. | |
#define | _mm_i32gather_pd(m, i, s) |
Gathers two 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm256_i32gather_pd(m, i, s) |
Gathers four 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm_i64gather_pd(m, i, s) |
Gathers two 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i. | |
#define | _mm256_i64gather_pd(m, i, s) |
Gathers four 64-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i. | |
#define | _mm_i32gather_ps(m, i, s) |
Gathers four 32-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm256_i32gather_ps(m, i, s) |
Gathers eight 32-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i. | |
#define | _mm_i64gather_ps(m, i, s) |
Gathers two 32-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i. | |
#define | _mm256_i64gather_ps(m, i, s) |
Gathers four 32-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i. | |
#define | _mm_i32gather_epi32(m, i, s) |
Gathers four 32-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm256_i32gather_epi32(m, i, s) |
Gathers eight 32-bit integer values from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i. | |
#define | _mm_i64gather_epi32(m, i, s) |
Gathers two 32-bit integer values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i. | |
#define | _mm256_i64gather_epi32(m, i, s) |
Gathers four 32-bit integer values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i. | |
#define | _mm_i32gather_epi64(m, i, s) |
Gathers two 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm256_i32gather_epi64(m, i, s) |
Gathers four 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i. | |
#define | _mm_i64gather_epi64(m, i, s) |
Gathers two 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i. | |
#define | _mm256_i64gather_epi64(m, i, s) |
Gathers four 64-bit integer values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i. | |
Functions | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_abs_epi8 (__m256i __a) |
Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each value in the corresponding byte of the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_abs_epi16 (__m256i __a) |
Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a and returns each value in the corresponding element of the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_abs_epi32 (__m256i __a) |
Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a and returns each value in the corresponding element of the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_packs_epi16 (__m256i __a, __m256i __b) |
Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation, and returns the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_packs_epi32 (__m256i __a, __m256i __b) |
Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation, and returns the resulting 256-bit vector of [16 x i16]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_packus_epi16 (__m256i __a, __m256i __b) |
Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation, and returns the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_packus_epi32 (__m256i __V1, __m256i __V2) |
Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation, and returns the resulting 256-bit vector of [16 x i16]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_add_epi8 (__m256i __a, __m256i __b) |
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 bits of each sum in the corresponding byte of the 256-bit integer vector result (overflow is ignored). | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_add_epi16 (__m256i __a, __m256i __b) |
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each sum in the corresponding element of the [16 x i16] result (overflow is ignored). | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_add_epi32 (__m256i __a, __m256i __b) |
Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each sum in the corresponding element of the [8 x i32] result (overflow is ignored). | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_add_epi64 (__m256i __a, __m256i __b) |
Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the lower 64 bits of each sum in the corresponding element of the [4 x i64] result (overflow is ignored). | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_adds_epi8 (__m256i __a, __m256i __b) |
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each sum in the corresponding byte of the 256-bit integer vector result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_adds_epi16 (__m256i __a, __m256i __b) |
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed saturation, and returns the [16 x i16] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_adds_epu8 (__m256i __a, __m256i __b) |
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation, and returns each sum in the corresponding byte of the 256-bit integer vector result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_adds_epu16 (__m256i __a, __m256i __b) |
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned saturation, and returns the [16 x i16] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_and_si256 (__m256i __a, __m256i __b) |
Computes the bitwise AND of the 256-bit integer vectors in __a and __b. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_andnot_si256 (__m256i __a, __m256i __b) |
Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit integer vector in __a. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_avg_epu8 (__m256i __a, __m256i __b) |
Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns each average in the corresponding byte of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_avg_epu16 (__m256i __a, __m256i __b) |
Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns each average in the corresponding element of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_blendv_epi8 (__m256i __V1, __m256i __V2, __m256i __M) |
Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the 256-bit mask __M and returns the resulting 256-bit integer vector. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cmpeq_epi8 (__m256i __a, __m256i __b) |
Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns the outcomes in the corresponding bytes of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cmpeq_epi16 (__m256i __a, __m256i __b) |
Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cmpeq_epi32 (__m256i __a, __m256i __b) |
Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cmpeq_epi64 (__m256i __a, __m256i __b) |
Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cmpgt_epi8 (__m256i __a, __m256i __b) |
Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than and returns the outcomes in the corresponding bytes of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cmpgt_epi16 (__m256i __a, __m256i __b) |
Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cmpgt_epi32 (__m256i __a, __m256i __b) |
Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cmpgt_epi64 (__m256i __a, __m256i __b) |
Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_hadd_epi16 (__m256i __a, __m256i __b) |
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each sum in an element of the [16 x i16] result (overflow is ignored). | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_hadd_epi32 (__m256i __a, __m256i __b) |
Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each sum in an element of the [8 x i32] result (overflow is ignored). | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_hadds_epi16 (__m256i __a, __m256i __b) |
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using signed saturation and returns each sum in an element of the [16 x i16] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_hsub_epi16 (__m256i __a, __m256i __b) |
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each difference in an element of the [16 x i16] result (overflow is ignored). | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_hsub_epi32 (__m256i __a, __m256i __b) |
Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each difference in an element of the [8 x i32] result (overflow is ignored). | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_hsubs_epi16 (__m256i __a, __m256i __b) |
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using signed saturation and returns each difference in an element of the [16 x i16] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_maddubs_epi16 (__m256i __a, __m256i __b) |
Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed byte from the 256-bit integer vector in __b, forming signed 16-bit intermediate products. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_madd_epi16 (__m256i __a, __m256i __b) |
Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit intermediate products, and adds pairs of those products to form 32-bit sums returned as elements of the [8 x i32] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_max_epi8 (__m256i __a, __m256i __b) |
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns the larger of each pair in the corresponding byte of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_max_epi16 (__m256i __a, __m256i __b) |
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_max_epi32 (__m256i __a, __m256i __b) |
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_max_epu8 (__m256i __a, __m256i __b) |
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the larger of each pair in the corresponding byte of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_max_epu16 (__m256i __a, __m256i __b) |
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_max_epu32 (__m256i __a, __m256i __b) |
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_min_epi8 (__m256i __a, __m256i __b) |
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns the smaller of each pair in the corresponding byte of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_min_epi16 (__m256i __a, __m256i __b) |
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_min_epi32 (__m256i __a, __m256i __b) |
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_min_epu8 (__m256i __a, __m256i __b) |
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the smaller of each pair in the corresponding byte of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_min_epu16 (__m256i __a, __m256i __b) |
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_min_epu32 (__m256i __a, __m256i __b) |
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result. | |
static __inline__ int __DEFAULT_FN_ATTRS256 | _mm256_movemask_epi8 (__m256i __a) |
Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vector in __a and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepi8_epi16 (__m128i __V) |
Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corresponding elements of a 256-bit vector of [16 x i16]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepi8_epi32 (__m128i __V) |
Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepi8_epi64 (__m128i __V) |
Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepi16_epi32 (__m128i __V) |
Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepi16_epi64 (__m128i __V) |
Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepi32_epi64 (__m128i __V) |
Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepu8_epi16 (__m128i __V) |
Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corresponding elements of a 256-bit vector of [16 x i16]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepu8_epi32 (__m128i __V) |
Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepu8_epi64 (__m128i __V) |
Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepu16_epi32 (__m128i __V) |
Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepu16_epi64 (__m128i __V) |
Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_cvtepu32_epi64 (__m128i __V) |
Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_mul_epi32 (__m256i __a, __m256i __b) |
Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and returns the 64-bit products in the [4 x i64] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_mulhrs_epi16 (__m256i __a, __m256i __b) |
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit results to the most significant 18 bits, rounds by adding 1, and returns bits [16:1] of each rounded product in the [16 x i16] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_mulhi_epu16 (__m256i __a, __m256i __b) |
Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper 16 bits of each 32-bit product in the [16 x i16] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_mulhi_epi16 (__m256i __a, __m256i __b) |
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper 16 bits of each 32-bit product in the [16 x i16] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_mullo_epi16 (__m256i __a, __m256i __b) |
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower 16 bits of each 32-bit product in the [16 x i16] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_mullo_epi32 (__m256i __a, __m256i __b) |
Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower 32 bits of each 64-bit product in the [8 x i32] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_mul_epu32 (__m256i __a, __m256i __b) |
Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and returns the 64-bit products in the [4 x i64] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_or_si256 (__m256i __a, __m256i __b) |
Computes the bitwise OR of the 256-bit integer vectors in __a and __b. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sad_epu8 (__m256i __a, __m256i __b) |
Computes four sum of absolute difference (SAD) operations on sets of eight unsigned 8-bit integers from the 256-bit integer vectors __a and __b. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_shuffle_epi8 (__m256i __a, __m256i __b) |
Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256-bit integer vector __b, and returns the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sign_epi8 (__m256i __a, __m256i __b) |
Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a, the negative of that byte, or zero, depending on whether the corresponding byte of the 256-bit integer vector in __b is greater than zero, less than zero, or equal to zero, respectively. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sign_epi16 (__m256i __a, __m256i __b) |
Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in __a, the negative of that element, or zero, depending on whether the corresponding element of the 256-bit vector of [16 x i16] in __b is greater than zero, less than zero, or equal to zero, respectively. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sign_epi32 (__m256i __a, __m256i __b) |
Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __a, the negative of that element, or zero, depending on whether the corresponding element of the 256-bit vector of [8 x i32] in __b is greater than zero, less than zero, or equal to zero, respectively. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_slli_epi16 (__m256i __a, int __count) |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sll_epi16 (__m256i __a, __m128i __count) |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits specified by the lower 64 bits of __count, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_slli_epi32 (__m256i __a, int __count) |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sll_epi32 (__m256i __a, __m128i __count) |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_slli_epi64 (__m256i __a, int __count) |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sll_epi64 (__m256i __a, __m128i __count) |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_srai_epi16 (__m256i __a, int __count) |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits, shifting in sign bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sra_epi16 (__m256i __a, __m128i __count) |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits given in the lower 64 bits of __count, shifting in sign bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_srai_epi32 (__m256i __a, int __count) |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits, shifting in sign bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sra_epi32 (__m256i __a, __m128i __count) |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits given in the lower 64 bits of __count, shifting in sign bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_srli_epi16 (__m256i __a, int __count) |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_srl_epi16 (__m256i __a, __m128i __count) |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_srli_epi32 (__m256i __a, int __count) |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_srl_epi32 (__m256i __a, __m128i __count) |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_srli_epi64 (__m256i __a, int __count) |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_srl_epi64 (__m256i __a, __m128i __count) |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sub_epi8 (__m256i __a, __m256i __b) |
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sub_epi16 (__m256i __a, __m256i __b) |
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sub_epi32 (__m256i __a, __m256i __b) |
Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sub_epi64 (__m256i __a, __m256i __b) |
Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_subs_epi8 (__m256i __a, __m256i __b) |
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each difference in the corresponding byte of the 256-bit integer vector result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_subs_epi16 (__m256i __a, __m256i __b) |
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed saturation, and returns each difference in the corresponding element of the [16 x i16] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_subs_epu8 (__m256i __a, __m256i __b) |
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation, and returns each difference in the corresponding byte of the 256-bit integer vector result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_subs_epu16 (__m256i __a, __m256i __b) |
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned saturation, and returns each difference in the corresponding element of the [16 x i16] result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_unpackhi_epi8 (__m256i __a, __m256i __b) |
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to form the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_unpackhi_epi16 (__m256i __a, __m256i __b) |
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __b to return the resulting 256-bit vector of [16 x i16]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_unpackhi_epi32 (__m256i __a, __m256i __b) |
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b to return the resulting 256-bit vector of [8 x i32]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_unpackhi_epi64 (__m256i __a, __m256i __b) |
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b to return the resulting 256-bit vector of [4 x i64]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_unpacklo_epi8 (__m256i __a, __m256i __b) |
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to form the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_unpacklo_epi16 (__m256i __a, __m256i __b) |
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __b to return the resulting 256-bit vector of [16 x i16]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_unpacklo_epi32 (__m256i __a, __m256i __b) |
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b to return the resulting 256-bit vector of [8 x i32]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_unpacklo_epi64 (__m256i __a, __m256i __b) |
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b to return the resulting 256-bit vector of [4 x i64]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_xor_si256 (__m256i __a, __m256i __b) |
Computes the bitwise XOR of the 256-bit integer vectors in __a and __b. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_stream_load_si256 (const void *__V) |
Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vector. | |
static __inline__ __m128 __DEFAULT_FN_ATTRS128 | _mm_broadcastss_ps (__m128 __X) |
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] in __X to all elements of the result's 128-bit vector of [4 x float]. | |
static __inline__ __m128d __DEFAULT_FN_ATTRS128 | _mm_broadcastsd_pd (__m128d __a) |
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double] in __a to both elements of the result's 128-bit vector of [2 x double]. | |
static __inline__ __m256 __DEFAULT_FN_ATTRS256 | _mm256_broadcastss_ps (__m128 __X) |
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] in __X to all elements of the result's 256-bit vector of [8 x float]. | |
static __inline__ __m256d __DEFAULT_FN_ATTRS256 | _mm256_broadcastsd_pd (__m128d __X) |
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double] in __X to all elements of the result's 256-bit vector of [4 x double]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_broadcastsi128_si256 (__m128i __X) |
Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_broadcastb_epi8 (__m128i __X) |
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_broadcastw_epi16 (__m128i __X) |
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result's 256-bit vector of [16 x i16]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_broadcastd_epi32 (__m128i __X) |
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 256-bit vector of [8 x i32]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_broadcastq_epi64 (__m128i __X) |
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result's 256-bit vector of [4 x i64]. | |
static __inline__ __m128i __DEFAULT_FN_ATTRS128 | _mm_broadcastb_epi8 (__m128i __X) |
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result. | |
static __inline__ __m128i __DEFAULT_FN_ATTRS128 | _mm_broadcastw_epi16 (__m128i __X) |
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result's 128-bit vector of [8 x i16]. | |
static __inline__ __m128i __DEFAULT_FN_ATTRS128 | _mm_broadcastd_epi32 (__m128i __X) |
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 128-bit vector of [4 x i32]. | |
static __inline__ __m128i __DEFAULT_FN_ATTRS128 | _mm_broadcastq_epi64 (__m128i __X) |
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result's 128-bit vector of [2 x i64]. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_permutevar8x32_epi32 (__m256i __a, __m256i __b) |
Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32] in __a as specified by indexes in the elements of the 256-bit vector of [8 x i32] in __b. | |
static __inline__ __m256 __DEFAULT_FN_ATTRS256 | _mm256_permutevar8x32_ps (__m256 __a, __m256i __b) |
Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x float] in __a as specified by indexes in the elements of the 256-bit vector of [8 x i32] in __b. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_maskload_epi32 (int const *__X, __m256i __M) |
Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_maskload_epi64 (long long const *__X, __m256i __M) |
Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero. | |
static __inline__ __m128i __DEFAULT_FN_ATTRS128 | _mm_maskload_epi32 (int const *__X, __m128i __M) |
Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero. | |
static __inline__ __m128i __DEFAULT_FN_ATTRS128 | _mm_maskload_epi64 (long long const *__X, __m128i __M) |
Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero. | |
static __inline__ void __DEFAULT_FN_ATTRS256 | _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y) |
Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged. | |
static __inline__ void __DEFAULT_FN_ATTRS256 | _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y) |
Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged. | |
static __inline__ void __DEFAULT_FN_ATTRS128 | _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y) |
Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged. | |
static __inline__ void __DEFAULT_FN_ATTRS128 | _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y) |
Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sllv_epi32 (__m256i __X, __m256i __Y) |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in zero bits, and returns the result. | |
static __inline__ __m128i __DEFAULT_FN_ATTRS128 | _mm_sllv_epi32 (__m128i __X, __m128i __Y) |
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_sllv_epi64 (__m256i __X, __m256i __Y) |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given in the corresponding element of the 256-bit vector of [4 x i64] in __Y, shifting in zero bits, and returns the result. | |
static __inline__ __m128i __DEFAULT_FN_ATTRS128 | _mm_sllv_epi64 (__m128i __X, __m128i __Y) |
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given in the corresponding element of the 128-bit vector of [2 x i64] in __Y, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_srav_epi32 (__m256i __X, __m256i __Y) |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in sign bits, and returns the result. | |
static __inline__ __m128i __DEFAULT_FN_ATTRS128 | _mm_srav_epi32 (__m128i __X, __m128i __Y) |
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in sign bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_srlv_epi32 (__m256i __X, __m256i __Y) |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in zero bits, and returns the result. | |
static __inline__ __m128i __DEFAULT_FN_ATTRS128 | _mm_srlv_epi32 (__m128i __X, __m128i __Y) |
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in zero bits, and returns the result. | |
static __inline__ __m256i __DEFAULT_FN_ATTRS256 | _mm256_srlv_epi64 (__m256i __X, __m256i __Y) |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [4 x i64] in __Y, shifting in zero bits, and returns the result. | |
static __inline__ __m128i __DEFAULT_FN_ATTRS128 | _mm_srlv_epi64 (__m128i __X, __m128i __Y) |
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [2 x i64] in __Y, shifting in zero bits, and returns the result. | |
#define __DEFAULT_FN_ATTRS128 |
Definition at line 29 of file avx2intrin.h.
#define __DEFAULT_FN_ATTRS256 |
Definition at line 26 of file avx2intrin.h.
#define _mm256_alignr_epi8 | ( | a, | |
b, | |||
n | |||
) |
Uses the lower half of the 256-bit vector a as the upper half of a temporary 256-bit value, and the lower half of the 256-bit vector b as the lower half of the temporary value.
Right-shifts the temporary value by n bytes, and uses the lower 16 bytes of the shifted value as the lower 16 bytes of the result. Uses the upper halves of a and b to make another temporary value, right shifts by n, and uses the lower 16 bytes of the shifted value as the upper 16 bytes of the result.
This intrinsic corresponds to the VPALIGNR
instruction.
a | A 256-bit integer vector containing source values. |
b | A 256-bit integer vector containing source values. |
n | An immediate value specifying the number of bytes to shift. |
Definition at line 447 of file avx2intrin.h.
#define _mm256_blend_epi16 | ( | V1, | |
V2, | |||
M | |||
) |
Merges 16-bit integer values from either of the two 256-bit vectors V1 or V2, as specified by the immediate integer operand M, and returns the resulting 256-bit vector of [16 x i16].
This intrinsic corresponds to the VPBLENDW
instruction.
V1 | A 256-bit vector of [16 x i16] containing source values. |
V2 | A 256-bit vector of [16 x i16] containing source values. |
M | An immediate 8-bit integer operand, with bits [7:0] specifying the source for each element of the result. The position of the mask bit corresponds to the index of a copied value. When a mask bit is 0, the element is copied from V1; otherwise, it is copied from V2. M[0] determines the source for elements 0 and 8, M[1] for elements 1 and 9, and so forth. |
Definition at line 612 of file avx2intrin.h.
#define _mm256_blend_epi32 | ( | V1, | |
V2, | |||
M | |||
) |
Merges 32-bit integer elements from either of the two 256-bit vectors of [8 x i32] in V1 or V2 to return a 256-bit vector of [8 x i32], as specified by the immediate integer operand M.
This intrinsic corresponds to the VPBLENDD
instruction.
V1 | A 256-bit vector of [8 x i32] containing source values. |
V2 | A 256-bit vector of [8 x i32] containing source values. |
M | An immediate 8-bit integer operand, with bits [7:0] specifying the source for each element of the result. The position of the mask bit corresponds to the index of a copied value. When a mask bit is 0, the element is copied from V1; otherwise, it is copied from V2. |
Definition at line 3157 of file avx2intrin.h.
#define _mm256_bslli_epi128 | ( | a, | |
imm | |||
) | ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) |
Shifts each 128-bit half of the 256-bit integer vector a left by imm bytes, shifting in zero bytes, and returns the result.
If imm is greater than 15, the returned result is all zeroes.
This intrinsic corresponds to the VPSLLDQ
instruction.
a | A 256-bit integer vector to be shifted. |
imm | An unsigned immediate value specifying the shift count (in bytes). |
Definition at line 2121 of file avx2intrin.h.
#define _mm256_bsrli_epi128 | ( | a, | |
imm | |||
) | ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) |
Shifts each 128-bit half of the 256-bit integer vector in a right by imm bytes, shifting in zero bytes, and returns the result.
If imm is greater than 15, the returned result is all zeroes.
This intrinsic corresponds to the VPSRLDQ
instruction.
a | A 256-bit integer vector to be shifted. |
imm | An unsigned immediate value specifying the shift count (in bytes). |
Definition at line 2365 of file avx2intrin.h.
#define _mm256_extracti128_si256 | ( | V, | |
M | |||
) | ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))) |
Extracts half of the 256-bit vector V to the 128-bit result.
If bit 0 of the immediate M is zero, extracts the lower half of V; otherwise, extracts the upper half.
This intrinsic corresponds to the VEXTRACTI128
instruction.
V | A 256-bit integer vector containing the source values. |
M | An immediate value specifying which half of V to extract. |
Definition at line 3468 of file avx2intrin.h.
#define _mm256_i32gather_epi32 | ( | m, | |
i, | |||
s | |||
) |
Gathers eight 32-bit integer values from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.
This intrinsic corresponds to the VPGATHERDD
instruction.
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [8 x i32] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 5086 of file avx2intrin.h.
#define _mm256_i32gather_epi64 | ( | m, | |
i, | |||
s | |||
) |
Gathers four 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
This intrinsic corresponds to the VPGATHERDQ
instruction.
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 5218 of file avx2intrin.h.
#define _mm256_i32gather_pd | ( | m, | |
i, | |||
s | |||
) |
Gathers four 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
This intrinsic corresponds to the VGATHERDPD
instruction.
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4804 of file avx2intrin.h.
#define _mm256_i32gather_ps | ( | m, | |
i, | |||
s | |||
) |
Gathers eight 32-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.
This intrinsic corresponds to the VGATHERDPS
instruction.
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [8 x i32] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4946 of file avx2intrin.h.
#define _mm256_i64gather_epi32 | ( | m, | |
i, | |||
s | |||
) |
Gathers four 32-bit integer values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
This intrinsic corresponds to the VPGATHERQD
instruction.
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [4 x i64] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 5152 of file avx2intrin.h.
#define _mm256_i64gather_epi64 | ( | m, | |
i, | |||
s | |||
) |
Gathers four 64-bit integer values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
This intrinsic corresponds to the VPGATHERQQ
instruction.
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [4 x i64] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 5284 of file avx2intrin.h.
#define _mm256_i64gather_pd | ( | m, | |
i, | |||
s | |||
) |
Gathers four 64-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
This intrinsic corresponds to the VGATHERQPD
instruction.
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [4 x i64] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4875 of file avx2intrin.h.
#define _mm256_i64gather_ps | ( | m, | |
i, | |||
s | |||
) |
Gathers four 32-bit floating-point values from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
This intrinsic corresponds to the VGATHERQPS
instruction.
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [4 x i64] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 5019 of file avx2intrin.h.
#define _mm256_inserti128_si256 | ( | V1, | |
V2, | |||
M | |||
) |
Copies the 256-bit vector V1 to the result, then overwrites half of the result with the 128-bit vector V2.
If bit 0 of the immediate M is zero, overwrites the lower half of the result; otherwise, overwrites the upper half.
This intrinsic corresponds to the VINSERTI128
instruction.
V1 | A 256-bit integer vector containing a source value. |
V2 | A 128-bit integer vector containing a source value. |
M | An immediate value specifying where to put V2 in the result. |
Definition at line 3491 of file avx2intrin.h.
#define _mm256_mask_i32gather_epi32 | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers eight 32-bit integer values, either from the 256-bit vector of [8 x i32] in a, or from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.
The 256-bit vector of [8 x i32] in mask determines the source for each element.
This intrinsic corresponds to the VPGATHERDD
instruction.
a | A 256-bit vector of [8 x i32] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [8 x i32] containing signed indexes into m. |
mask | A 256-bit vector of [8 x i32] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4443 of file avx2intrin.h.
#define _mm256_mask_i32gather_epi64 | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers four 64-bit integer values, either from the 256-bit vector of [4 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
The 256-bit vector of [4 x i64] in mask determines the source for each element.
This intrinsic corresponds to the VPGATHERDQ
instruction.
a | A 256-bit vector of [4 x i64] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. |
mask | A 256-bit vector of [4 x i64] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4639 of file avx2intrin.h.
#define _mm256_mask_i32gather_pd | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers four 64-bit floating-point values, either from the 256-bit vector of [4 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
The 256-bit vector of [4 x double] in mask determines the source for each element.
This intrinsic corresponds to the VGATHERDPD
instruction.
a | A 256-bit vector of [4 x double] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. |
mask | A 256-bit vector of [4 x double] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4056 of file avx2intrin.h.
#define _mm256_mask_i32gather_ps | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers eight 32-bit floating-point values, either from the 256-bit vector of [8 x float] in a, or from memory m using scaled indexes from the 256-bit vector of [8 x i32] in i.
The 256-bit vector of [8 x float] in mask determines the source for each element.
This intrinsic corresponds to the VGATHERDPS
instruction.
a | A 256-bit vector of [8 x float] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [8 x i32] containing signed indexes into m. |
mask | A 256-bit vector of [8 x float] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4248 of file avx2intrin.h.
#define _mm256_mask_i64gather_epi32 | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers four 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
The 128-bit vector of [4 x i32] in mask determines the source for each element.
This intrinsic corresponds to the VPGATHERQD
instruction.
a | A 128-bit vector of [4 x i32] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [4 x i64] containing signed indexes into m. |
mask | A 128-bit vector of [4 x i32] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4542 of file avx2intrin.h.
#define _mm256_mask_i64gather_epi64 | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers four 64-bit integer values, either from the 256-bit vector of [4 x i64] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
The 256-bit vector of [4 x i64] in mask determines the source for each element.
This intrinsic corresponds to the VPGATHERQQ
instruction.
a | A 256-bit vector of [4 x i64] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [4 x i64] containing signed indexes into m. |
mask | A 256-bit vector of [4 x i64] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4735 of file avx2intrin.h.
#define _mm256_mask_i64gather_pd | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers four 64-bit floating-point values, either from the 256-bit vector of [4 x double] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
The 256-bit vector of [4 x double] in mask determines the source for each element.
This intrinsic corresponds to the VGATHERQPD
instruction.
a | A 256-bit vector of [4 x double] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [4 x i64] containing signed indexes into m. |
mask | A 256-bit vector of [4 x double] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4152 of file avx2intrin.h.
#define _mm256_mask_i64gather_ps | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers four 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 256-bit vector of [4 x i64] in i.
The 128-bit vector of [4 x float] in mask determines the source for each element.
This intrinsic corresponds to the VGATHERQPS
instruction.
a | A 128-bit vector of [4 x float] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 256-bit vector of [4 x i64] containing signed indexes into m. |
mask | A 128-bit vector of [4 x float] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4347 of file avx2intrin.h.
#define _mm256_mpsadbw_epu8 | ( | X, | |
Y, | |||
M | |||
) |
Computes sixteen sum of absolute difference (SAD) operations on sets of four unsigned 8-bit integers from the 256-bit integer vectors X and Y.
Eight SAD results are computed using the lower half of the input vectors, and another eight using the upper half. These 16-bit values are returned in the lower and upper halves of the 256-bit result, respectively.
A single SAD operation selects four bytes from X and four bytes from Y as input. It computes the differences between each X byte and the corresponding Y byte, takes the absolute value of each difference, and sums these four values to form one 16-bit result. The intrinsic computes 16 of these results with different sets of input bytes.
For each set of eight results, the SAD operations use the same four bytes from Y; the starting bit position for these four bytes is specified by M[1:0] times 32 for the lower half of the result (M[4:3] times 32 for the upper half). The eight operations use successive sets of four bytes from X; the starting bit position for the first set of four bytes is specified by M[2] times 32 for the lower half of the result (M[5] times 32 for the upper half). These bit positions are all relative to the 128-bit lane for each set of eight operations.
This intrinsic corresponds to the VMPSADBW
instruction.
X | A 256-bit integer vector containing one of the inputs. |
Y | A 256-bit integer vector containing one of the inputs. |
M | An unsigned immediate value specifying the starting positions of the bytes to operate on. |
Definition at line 92 of file avx2intrin.h.
#define _mm256_permute2x128_si256 | ( | V1, | |
V2, | |||
M | |||
) | ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))) |
Sets each half of the 256-bit result either to zero or to one of the four possible 128-bit halves of the 256-bit vectors V1 and V2, as specified by the immediate value M.
This intrinsic corresponds to the VPERM2I128
instruction.
V1 | A 256-bit integer vector containing source values. |
V2 | A 256-bit integer vector containing source values. |
M | An immediate value specifying how to form the result. Bits [3:0] control the lower half of the result, bits [7:4] control the upper half. Within each 4-bit control value, if bit 3 is 1, the result is zero; otherwise bits [1:0] determine the source as follows. 0: the lower half of V1; 1: the upper half of V1; 2: the lower half of V2; 3: the upper half of V2. |
Definition at line 3448 of file avx2intrin.h.
#define _mm256_permute4x64_epi64 | ( | V, | |
M | |||
) | ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))) |
Sets the result's 256-bit vector of [4 x i64] to copies of elements of the 256-bit vector of [4 x i64] in V as specified by the immediate value M.
This intrinsic corresponds to the VPERMQ
instruction.
V | A 256-bit vector of [4 x i64] containing the source values. |
M | An immediate 8-bit value specifying which elements to copy from V. M[1:0] specifies the index in V for element 0 of the result, M[3:2] specifies the index for element 1, and so forth. |
Definition at line 3402 of file avx2intrin.h.
#define _mm256_permute4x64_pd | ( | V, | |
M | |||
) | ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))) |
Sets the result's 256-bit vector of [4 x double] to copies of elements of the 256-bit vector of [4 x double] in V as specified by the immediate value M.
This intrinsic corresponds to the VPERMPD
instruction.
V | A 256-bit vector of [4 x double] containing the source values. |
M | An immediate 8-bit value specifying which elements to copy from V. M[1:0] specifies the index in V for element 0 of the result, M[3:2] specifies the index for element 1, and so forth. |
Definition at line 3344 of file avx2intrin.h.
#define _mm256_shuffle_epi32 | ( | a, | |
imm | |||
) | ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))) |
Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in a according to control information in the integer literal imm, and returns the 256-bit result.
In effect there are two parallel 128-bit shuffles in the lower and upper halves.
This intrinsic corresponds to the VPSHUFD
instruction.
a | A 256-bit vector of [8 x i32] containing source values. |
imm | An immediate 8-bit value specifying which elements to copy from a. imm[1:0] specifies the index in a for elements 0 and 4 of the result, imm[3:2] specifies the index for elements 1 and 5, and so forth. |
Definition at line 1945 of file avx2intrin.h.
#define _mm256_shufflehi_epi16 | ( | a, | |
imm | |||
) | ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))) |
Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in a according to control information in the integer literal imm, and returns the 256-bit result.
The upper 64 bits of each 128-bit half are shuffled in parallel; the lower 64 bits of each 128-bit half are copied from a unchanged.
This intrinsic corresponds to the VPSHUFHW
instruction.
a | A 256-bit vector of [16 x i16] containing source values. |
imm | An immediate 8-bit value specifying which elements to copy from a. imm[1:0] specifies the index in a for elements 4 and 12 of the result, imm[3:2] specifies the index for elements 5 and 13, and so forth. Indexes are offset by 4 (so 0 means index 4, and so forth). |
Definition at line 1981 of file avx2intrin.h.
#define _mm256_shufflelo_epi16 | ( | a, | |
imm | |||
) | ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))) |
Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in a according to control information in the integer literal imm, and returns the 256-bit [16 x i16] result.
The lower 64 bits of each 128-bit half are shuffled; the upper 64 bits of each 128-bit half are copied from a unchanged.
This intrinsic corresponds to the VPSHUFLW
instruction.
a | A 256-bit vector of [16 x i16] to use as a source of data for the result. |
imm | An immediate 8-bit value specifying which elements to copy from a. imm[1:0] specifies the index in a for elements 0 and 8 of the result, imm[3:2] specifies the index for elements 1 and 9, and so forth. |
Definition at line 2018 of file avx2intrin.h.
#define _mm256_slli_si256 | ( | a, | |
imm | |||
) | ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) |
Shifts each 128-bit half of the 256-bit integer vector a left by imm bytes, shifting in zero bytes, and returns the result.
If imm is greater than 15, the returned result is all zeroes.
This intrinsic corresponds to the VPSLLDQ
instruction.
a | A 256-bit integer vector to be shifted. |
imm | An unsigned immediate value specifying the shift count (in bytes). |
Definition at line 2101 of file avx2intrin.h.
#define _mm256_srli_si256 | ( | a, | |
imm | |||
) | ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) |
Shifts each 128-bit half of the 256-bit integer vector in a right by imm bytes, shifting in zero bytes, and returns the result.
If imm is greater than 15, the returned result is all zeroes.
This intrinsic corresponds to the VPSRLDQ
instruction.
a | A 256-bit integer vector to be shifted. |
imm | An unsigned immediate value specifying the shift count (in bytes). |
Definition at line 2345 of file avx2intrin.h.
#define _mm_blend_epi32 | ( | V1, | |
V2, | |||
M | |||
) |
Merges 32-bit integer elements from either of the two 128-bit vectors of [4 x i32] in V1 or V2 to the result's 128-bit vector of [4 x i32], as specified by the immediate integer operand M.
This intrinsic corresponds to the VPBLENDD
instruction.
V1 | A 128-bit vector of [4 x i32] containing source values. |
V2 | A 128-bit vector of [4 x i32] containing source values. |
M | An immediate 8-bit integer operand, with bits [3:0] specifying the source for each element of the result. The position of the mask bit corresponds to the index of a copied value. When a mask bit is 0, the element is copied from V1; otherwise, it is copied from V2. |
Definition at line 3120 of file avx2intrin.h.
#define _mm_broadcastsi128_si256 | ( | X | ) | _mm256_broadcastsi128_si256(X) |
Definition at line 3085 of file avx2intrin.h.
#define _mm_i32gather_epi32 | ( | m, | |
i, | |||
s | |||
) |
Gathers four 32-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
This intrinsic corresponds to the VPGATHERDD
instruction.
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 5054 of file avx2intrin.h.
#define _mm_i32gather_epi64 | ( | m, | |
i, | |||
s | |||
) |
Gathers two 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
This intrinsic corresponds to the VPGATHERDQ
instruction.
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. Only the first two elements are used. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 5185 of file avx2intrin.h.
#define _mm_i32gather_pd | ( | m, | |
i, | |||
s | |||
) |
Gathers two 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
This intrinsic corresponds to the VGATHERDPD
instruction.
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. Only the first two elements are used. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4769 of file avx2intrin.h.
#define _mm_i32gather_ps | ( | m, | |
i, | |||
s | |||
) |
Gathers four 32-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
This intrinsic corresponds to the VGATHERDPS
instruction.
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4911 of file avx2intrin.h.
#define _mm_i64gather_epi32 | ( | m, | |
i, | |||
s | |||
) |
Gathers two 32-bit integer values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
The upper two elements of the result are zeroed.
This intrinsic corresponds to the VPGATHERQD
instruction.
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [2 x i64] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 5120 of file avx2intrin.h.
#define _mm_i64gather_epi64 | ( | m, | |
i, | |||
s | |||
) |
Gathers two 64-bit integer values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
This intrinsic corresponds to the VPGATHERQQ
instruction.
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [2 x i64] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 5251 of file avx2intrin.h.
#define _mm_i64gather_pd | ( | m, | |
i, | |||
s | |||
) |
Gathers two 64-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
This intrinsic corresponds to the VGATHERQPD
instruction.
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [2 x i64] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4840 of file avx2intrin.h.
#define _mm_i64gather_ps | ( | m, | |
i, | |||
s | |||
) |
Gathers two 32-bit floating-point values from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
The upper two elements of the result are zeroed.
This intrinsic corresponds to the VGATHERQPS
instruction.
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [2 x i64] containing signed indexes into m. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4984 of file avx2intrin.h.
#define _mm_mask_i32gather_epi32 | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers four 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
The 128-bit vector of [4 x i32] in mask determines the source for each element.
This intrinsic corresponds to the VPGATHERDD
instruction.
a | A 128-bit vector of [4 x i32] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. |
mask | A 128-bit vector of [4 x i32] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4395 of file avx2intrin.h.
#define _mm_mask_i32gather_epi64 | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers two 64-bit integer values, either from the 128-bit vector of [2 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
The 128-bit vector of [2 x i64] in mask determines the source for each element.
This intrinsic corresponds to the VPGATHERDQ
instruction.
a | A 128-bit vector of [2 x i64] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. Only the first two elements are used. |
mask | A 128-bit vector of [2 x i64] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4591 of file avx2intrin.h.
#define _mm_mask_i32gather_pd | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers two 64-bit floating-point values, either from the 128-bit vector of [2 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
The 128-bit vector of [2 x double] in mask determines the source for each element.
This intrinsic corresponds to the VGATHERDPD
instruction.
a | A 128-bit vector of [2 x double] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. Only the first two elements are used. |
mask | A 128-bit vector of [2 x double] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4008 of file avx2intrin.h.
#define _mm_mask_i32gather_ps | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers four 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 128-bit vector of [4 x i32] in i.
The 128-bit vector of [4 x float] in mask determines the source for each element.
This intrinsic corresponds to the VGATHERDPS
instruction.
a | A 128-bit vector of [4 x float] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [4 x i32] containing signed indexes into m. |
mask | A 128-bit vector of [4 x float] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4200 of file avx2intrin.h.
#define _mm_mask_i64gather_epi32 | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers two 32-bit integer values, either from the 128-bit vector of [4 x i32] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
The 128-bit vector of [4 x i32] in mask determines the source for the lower two elements. The upper two elements of the result are zeroed.
This intrinsic corresponds to the VPGATHERQD
instruction.
a | A 128-bit vector of [4 x i32] used as the source when a mask bit is zero. Only the first two elements are used. |
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [2 x i64] containing signed indexes into m. |
mask | A 128-bit vector of [4 x i32] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. Only the first two elements are used. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4494 of file avx2intrin.h.
#define _mm_mask_i64gather_epi64 | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers two 64-bit integer values, either from the 128-bit vector of [2 x i64] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
The 128-bit vector of [2 x i64] in mask determines the source for each element.
This intrinsic corresponds to the VPGATHERQQ
instruction.
a | A 128-bit vector of [2 x i64] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [2 x i64] containing signed indexes into m. |
mask | A 128-bit vector of [2 x i64] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4687 of file avx2intrin.h.
#define _mm_mask_i64gather_pd | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers two 64-bit floating-point values, either from the 128-bit vector of [2 x double] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
The 128-bit vector of [2 x double] in mask determines the source for each element.
This intrinsic corresponds to the VGATHERQPD
instruction.
a | A 128-bit vector of [2 x double] used as the source when a mask bit is zero. |
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [2 x i64] containing signed indexes into m. |
mask | A 128-bit vector of [2 x double] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4104 of file avx2intrin.h.
#define _mm_mask_i64gather_ps | ( | a, | |
m, | |||
i, | |||
mask, | |||
s | |||
) |
Conditionally gathers two 32-bit floating-point values, either from the 128-bit vector of [4 x float] in a, or from memory m using scaled indexes from the 128-bit vector of [2 x i64] in i.
The 128-bit vector of [4 x float] in mask determines the source for the lower two elements. The upper two elements of the result are zeroed.
This intrinsic corresponds to the VGATHERQPS
instruction.
a | A 128-bit vector of [4 x float] used as the source when a mask bit is zero. Only the first two elements are used. |
m | A pointer to the memory used for loading values. |
i | A 128-bit vector of [2 x i64] containing signed indexes into m. |
mask | A 128-bit vector of [4 x float] containing the mask. The most significant bit of each element in the mask vector represents the mask bits. If a mask bit is zero, the corresponding value from vector a is gathered; otherwise the value is loaded from memory. Only the first two elements are used. |
s | A literal constant scale factor for the indexes in i. Must be 1, 2, 4, or 8. |
Definition at line 4299 of file avx2intrin.h.
|
static |
Computes the absolute value of each signed 16-bit element in the 256-bit vector of [16 x i16] in __a and returns each value in the corresponding element of the result.
This intrinsic corresponds to the VPABSW
instruction.
__a | A 256-bit vector of [16 x i16]. |
Definition at line 125 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_abs_epi16(), and _mm256_maskz_abs_epi16().
|
static |
Computes the absolute value of each signed 32-bit element in the 256-bit vector of [8 x i32] in __a and returns each value in the corresponding element of the result.
This intrinsic corresponds to the VPABSD
instruction.
__a | A 256-bit vector of [8 x i32]. |
Definition at line 142 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_abs_epi32(), and _mm256_maskz_abs_epi32().
|
static |
Computes the absolute value of each signed byte in the 256-bit integer vector __a and returns each value in the corresponding byte of the result.
This intrinsic corresponds to the VPABSB
instruction.
__a | A 256-bit integer vector. |
Definition at line 108 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_abs_epi8(), and _mm256_maskz_abs_epi8().
|
static |
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each sum in the corresponding element of the [16 x i16] result (overflow is ignored).
This intrinsic corresponds to the VPADDW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 306 of file avx2intrin.h.
Referenced by _mm256_mask_add_epi16(), and _mm256_maskz_add_epi16().
|
static |
Adds 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each sum in the corresponding element of the [8 x i32] result (overflow is ignored).
This intrinsic corresponds to the VPADDD
instruction.
__a | A 256-bit vector of [8 x i32] containing one of the source operands. |
__b | A 256-bit vector of [8 x i32] containing one of the source operands. |
Definition at line 325 of file avx2intrin.h.
Referenced by _mm256_mask_add_epi32(), and _mm256_maskz_add_epi32().
|
static |
Adds 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64] and returns the lower 64 bits of each sum in the corresponding element of the [4 x i64] result (overflow is ignored).
This intrinsic corresponds to the VPADDQ
instruction.
__a | A 256-bit vector of [4 x i64] containing one of the source operands. |
__b | A 256-bit vector of [4 x i64] containing one of the source operands. |
Definition at line 344 of file avx2intrin.h.
Referenced by _mm256_mask_add_epi64(), and _mm256_maskz_add_epi64().
|
static |
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors and returns the lower 8 bits of each sum in the corresponding byte of the 256-bit integer vector result (overflow is ignored).
This intrinsic corresponds to the VPADDB
instruction.
__a | A 256-bit integer vector containing one of the source operands. |
__b | A 256-bit integer vector containing one of the source operands. |
Definition at line 287 of file avx2intrin.h.
Referenced by _mm256_mask_add_epi8(), and _mm256_maskz_add_epi8().
|
static |
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed saturation, and returns the [16 x i16] result.
This intrinsic corresponds to the VPADDSW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 381 of file avx2intrin.h.
Referenced by _mm256_mask_adds_epi16(), and _mm256_maskz_adds_epi16().
|
static |
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each sum in the corresponding byte of the 256-bit integer vector result.
This intrinsic corresponds to the VPADDSB
instruction.
__a | A 256-bit integer vector containing one of the source operands. |
__b | A 256-bit integer vector containing one of the source operands. |
Definition at line 363 of file avx2intrin.h.
Referenced by _mm256_mask_adds_epi8(), and _mm256_maskz_adds_epi8().
|
static |
Adds 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
This intrinsic corresponds to the VPADDUSW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 418 of file avx2intrin.h.
Referenced by _mm256_mask_adds_epu16(), and _mm256_maskz_adds_epu16().
|
static |
Adds 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation, and returns each sum in the corresponding byte of the 256-bit integer vector result.
This intrinsic corresponds to the VPADDUSB
instruction.
__a | A 256-bit integer vector containing one of the source operands. |
__b | A 256-bit integer vector containing one of the source operands. |
Definition at line 400 of file avx2intrin.h.
Referenced by _mm256_mask_adds_epu8(), and _mm256_maskz_adds_epu8().
|
static |
Computes the bitwise AND of the 256-bit integer vectors in __a and __b.
This intrinsic corresponds to the VPAND
instruction.
__a | A 256-bit integer vector. |
__b | A 256-bit integer vector. |
Definition at line 464 of file avx2intrin.h.
Referenced by _mm256_mask_test_epi16_mask(), _mm256_mask_test_epi32_mask(), _mm256_mask_test_epi64_mask(), _mm256_mask_test_epi8_mask(), _mm256_mask_testn_epi16_mask(), _mm256_mask_testn_epi32_mask(), _mm256_mask_testn_epi64_mask(), _mm256_mask_testn_epi8_mask(), _mm256_test_epi16_mask(), _mm256_test_epi32_mask(), _mm256_test_epi64_mask(), _mm256_test_epi8_mask(), _mm256_testn_epi16_mask(), _mm256_testn_epi32_mask(), _mm256_testn_epi64_mask(), and _mm256_testn_epi8_mask().
|
static |
Computes the bitwise AND of the 256-bit integer vector in __b with the bitwise NOT of the 256-bit integer vector in __a.
This intrinsic corresponds to the VPANDN
instruction.
__a | A 256-bit integer vector. |
__b | A 256-bit integer vector. |
Definition at line 482 of file avx2intrin.h.
|
static |
Computes the averages of the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns each average in the corresponding element of the 256-bit result.
This intrinsic corresponds to the VPAVGW
instruction.
__a | A 256-bit vector of [16 x i16]. |
__b | A 256-bit vector of [16 x i16]. |
Definition at line 534 of file avx2intrin.h.
Referenced by _mm256_mask_avg_epu16(), and _mm256_maskz_avg_epu16().
|
static |
Computes the averages of the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns each average in the corresponding byte of the 256-bit result.
This intrinsic corresponds to the VPAVGB
instruction.
__a | A 256-bit integer vector. |
__b | A 256-bit integer vector. |
Definition at line 508 of file avx2intrin.h.
Referenced by _mm256_mask_avg_epu8(), and _mm256_maskz_avg_epu8().
|
static |
Merges 8-bit integer values from either of the two 256-bit vectors __V1 or __V2, as specified by the 256-bit mask __M and returns the resulting 256-bit integer vector.
This intrinsic corresponds to the VPBLENDVB
instruction.
__V1 | A 256-bit integer vector containing source values. |
__V2 | A 256-bit integer vector containing source values. |
__M | A 256-bit integer vector, with bit [7] of each byte specifying the source for each corresponding byte of the result. When the mask bit is 0, the byte is copied from __V1; otherwise, it is copied from __V2. |
Definition at line 569 of file avx2intrin.h.
|
static |
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 256-bit result.
This intrinsic corresponds to the VPBROADCASTB
instruction.
__X | A 128-bit integer vector whose low byte will be broadcast. |
Definition at line 3172 of file avx2intrin.h.
Referenced by _mm256_mask_broadcastb_epi8(), and _mm256_maskz_broadcastb_epi8().
|
static |
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's 256-bit vector of [8 x i32].
This intrinsic corresponds to the VPBROADCASTD
instruction.
__X | A 128-bit vector of [4 x i32] whose low element will be broadcast. |
Definition at line 3204 of file avx2intrin.h.
Referenced by _mm256_mask_broadcastd_epi32(), and _mm256_maskz_broadcastd_epi32().
|
static |
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to all elements of the result's 256-bit vector of [4 x i64].
This intrinsic corresponds to the VPBROADCASTQ
instruction.
__X | A 128-bit vector of [2 x i64] whose low element will be broadcast. |
Definition at line 3220 of file avx2intrin.h.
Referenced by _mm256_mask_broadcastq_epi64(), and _mm256_maskz_broadcastq_epi64().
|
static |
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double] in __X to all elements of the result's 256-bit vector of [4 x double].
This intrinsic corresponds to the VBROADCASTSD
instruction.
__X | A 128-bit vector of [2 x double] whose low element will be broadcast. |
Definition at line 3064 of file avx2intrin.h.
Referenced by _mm256_mask_broadcastsd_pd(), and _mm256_maskz_broadcastsd_pd().
|
static |
Broadcasts the 128-bit integer data from __X to both the lower and upper halves of the 256-bit result.
This intrinsic corresponds to the VBROADCASTI128
instruction.
__X | A 128-bit integer vector to be broadcast. |
Definition at line 3080 of file avx2intrin.h.
|
static |
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] in __X to all elements of the result's 256-bit vector of [8 x float].
This intrinsic corresponds to the VBROADCASTSS
instruction.
__X | A 128-bit vector of [4 x float] whose low element will be broadcast. |
Definition at line 3047 of file avx2intrin.h.
Referenced by _mm256_mask_broadcastss_ps(), and _mm256_maskz_broadcastss_ps().
|
static |
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result's 256-bit vector of [16 x i16].
This intrinsic corresponds to the VPBROADCASTW
instruction.
__X | A 128-bit vector of [8 x i16] whose low element will be broadcast. |
Definition at line 3188 of file avx2intrin.h.
Referenced by _mm256_mask_broadcastw_epi16(), and _mm256_maskz_broadcastw_epi16().
|
static |
Compares corresponding elements in the 256-bit vectors of [16 x i16] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result.
This intrinsic corresponds to the VPCMPEQW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the inputs. |
__b | A 256-bit vector of [16 x i16] containing one of the inputs. |
Definition at line 663 of file avx2intrin.h.
|
static |
Compares corresponding elements in the 256-bit vectors of [8 x i32] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result.
This intrinsic corresponds to the VPCMPEQD
instruction.
__a | A 256-bit vector of [8 x i32] containing one of the inputs. |
__b | A 256-bit vector of [8 x i32] containing one of the inputs. |
Definition at line 689 of file avx2intrin.h.
|
static |
Compares corresponding elements in the 256-bit vectors of [4 x i64] in __a and __b for equality and returns the outcomes in the corresponding elements of the 256-bit result.
This intrinsic corresponds to the VPCMPEQQ
instruction.
__a | A 256-bit vector of [4 x i64] containing one of the inputs. |
__b | A 256-bit vector of [4 x i64] containing one of the inputs. |
Definition at line 715 of file avx2intrin.h.
|
static |
Compares corresponding bytes in the 256-bit integer vectors in __a and __b for equality and returns the outcomes in the corresponding bytes of the 256-bit result.
This intrinsic corresponds to the VPCMPEQB
instruction.
__a | A 256-bit integer vector containing one of the inputs. |
__b | A 256-bit integer vector containing one of the inputs. |
Definition at line 637 of file avx2intrin.h.
|
static |
Compares corresponding signed elements in the 256-bit vectors of [16 x i16] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result.
This intrinsic corresponds to the VPCMPGTW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the inputs. |
__b | A 256-bit vector of [16 x i16] containing one of the inputs. |
Definition at line 769 of file avx2intrin.h.
|
static |
Compares corresponding signed elements in the 256-bit vectors of [8 x i32] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result.
This intrinsic corresponds to the VPCMPGTD
instruction.
__a | A 256-bit vector of [8 x i32] containing one of the inputs. |
__b | A 256-bit vector of [8 x i32] containing one of the inputs. |
Definition at line 795 of file avx2intrin.h.
|
static |
Compares corresponding signed elements in the 256-bit vectors of [4 x i64] in __a and __b for greater-than and returns the outcomes in the corresponding elements of the 256-bit result.
This intrinsic corresponds to the VPCMPGTQ
instruction.
__a | A 256-bit vector of [4 x i64] containing one of the inputs. |
__b | A 256-bit vector of [4 x i64] containing one of the inputs. |
Definition at line 821 of file avx2intrin.h.
|
static |
Compares corresponding signed bytes in the 256-bit integer vectors in __a and __b for greater-than and returns the outcomes in the corresponding bytes of the 256-bit result.
This intrinsic corresponds to the VPCMPGTB
instruction.
__a | A 256-bit integer vector containing one of the inputs. |
__b | A 256-bit integer vector containing one of the inputs. |
Definition at line 741 of file avx2intrin.h.
|
static |
Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
This intrinsic corresponds to the VPMOVSXWD
instruction.
__V | A 128-bit vector of [8 x i16] containing the source values. |
Definition at line 1450 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepi16_epi32(), and _mm256_maskz_cvtepi16_epi32().
|
static |
Sign-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
This intrinsic corresponds to the VPMOVSXWQ
instruction.
__V | A 128-bit vector of [8 x i16] containing the source values. |
Definition at line 1475 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepi16_epi64(), and _mm256_maskz_cvtepi16_epi64().
|
static |
Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
This intrinsic corresponds to the VPMOVSXDQ
instruction.
__V | A 128-bit vector of [4 x i32] containing the source values. |
Definition at line 1500 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepi32_epi64(), and _mm256_maskz_cvtepi32_epi64().
|
static |
Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corresponding elements of a 256-bit vector of [16 x i16].
This intrinsic corresponds to the VPMOVSXBW
instruction.
__V | A 128-bit integer vector containing the source bytes. |
Definition at line 1367 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepi8_epi16(), and _mm256_maskz_cvtepi8_epi16().
|
static |
Sign-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
This intrinsic corresponds to the VPMOVSXBD
instruction.
__V | A 128-bit integer vector containing the source bytes. |
Definition at line 1395 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepi8_epi32(), and _mm256_maskz_cvtepi8_epi32().
|
static |
Sign-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
This intrinsic corresponds to the VPMOVSXBQ
instruction.
__V | A 128-bit integer vector containing the source bytes. |
Definition at line 1422 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepi8_epi64(), and _mm256_maskz_cvtepi8_epi64().
|
static |
Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
This intrinsic corresponds to the VPMOVZXWD
instruction.
__V | A 128-bit vector of [8 x i16] containing the source values. |
Definition at line 1603 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepu16_epi32(), and _mm256_maskz_cvtepu16_epi32().
|
static |
Zero-extends 16-bit elements from the lower half of the 128-bit vector of [8 x i16] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
This intrinsic corresponds to the VPMOVZXWQ
instruction.
__V | A 128-bit vector of [8 x i16] containing the source values. |
Definition at line 1628 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepu16_epi64(), and _mm256_maskz_cvtepu16_epi64().
|
static |
Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
This intrinsic corresponds to the VPMOVZXDQ
instruction.
__V | A 128-bit vector of [4 x i32] containing the source values. |
Definition at line 1653 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepu32_epi64(), and _mm256_maskz_cvtepu32_epi64().
|
static |
Zero-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corresponding elements of a 256-bit vector of [16 x i16].
This intrinsic corresponds to the VPMOVZXBW
instruction.
__V | A 128-bit integer vector containing the source bytes. |
Definition at line 1526 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepu8_epi16(), and _mm256_maskz_cvtepu8_epi16().
|
static |
Zero-extends bytes from the lower half of the 128-bit integer vector in __V and returns the 32-bit values in the corresponding elements of a 256-bit vector of [8 x i32].
This intrinsic corresponds to the VPMOVZXBD
instruction.
__V | A 128-bit integer vector containing the source bytes. |
Definition at line 1552 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepu8_epi32(), and _mm256_maskz_cvtepu8_epi32().
|
static |
Zero-extends the first four bytes from the 128-bit integer vector in __V and returns the 64-bit values in the corresponding elements of a 256-bit vector of [4 x i64].
This intrinsic corresponds to the VPMOVZXBQ
instruction.
__V | A 128-bit integer vector containing the source bytes. |
Definition at line 1577 of file avx2intrin.h.
Referenced by _mm256_mask_cvtepu8_epi64(), and _mm256_maskz_cvtepu8_epi64().
|
static |
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each sum in an element of the [16 x i16] result (overflow is ignored).
Sums from __a are returned in the lower 64 bits of each 128-bit half of the result; sums from __b are returned in the upper 64 bits of each 128-bit half of the result.
This intrinsic corresponds to the VPHADDW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 857 of file avx2intrin.h.
|
static |
Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each sum in an element of the [8 x i32] result (overflow is ignored).
Sums from __a are returned in the lower 64 bits of each 128-bit half of the result; sums from __b are returned in the upper 64 bits of each 128-bit half of the result.
This intrinsic corresponds to the VPHADDD
instruction.
__a | A 256-bit vector of [8 x i32] containing one of the source operands. |
__b | A 256-bit vector of [8 x i32] containing one of the source operands. |
Definition at line 889 of file avx2intrin.h.
|
static |
Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using signed saturation and returns each sum in an element of the [16 x i16] result.
Sums from __a are returned in the lower 64 bits of each 128-bit half of the result; sums from __b are returned in the upper 64 bits of each 128-bit half of the result.
This intrinsic corresponds to the VPHADDSW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 924 of file avx2intrin.h.
|
static |
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] and returns the lower 16 bits of each difference in an element of the [16 x i16] result (overflow is ignored).
Differences from __a are returned in the lower 64 bits of each 128-bit half of the result; differences from __b are returned in the upper 64 bits of each 128-bit half of the result.
This intrinsic corresponds to the VPHSUBW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 960 of file avx2intrin.h.
|
static |
Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit vectors of [8 x i32] and returns the lower 32 bits of each difference in an element of the [8 x i32] result (overflow is ignored).
Differences from __a are returned in the lower 64 bits of each 128-bit half of the result; differences from __b are returned in the upper 64 bits of each 128-bit half of the result.
This intrinsic corresponds to the VPHSUBD
instruction.
__a | A 256-bit vector of [8 x i32] containing one of the source operands. |
__b | A 256-bit vector of [8 x i32] containing one of the source operands. |
Definition at line 992 of file avx2intrin.h.
|
static |
Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit vectors of [16 x i16] using signed saturation and returns each difference in an element of the [16 x i16] result.
Differences from __a are returned in the lower 64 bits of each 128-bit half of the result; differences from __b are returned in the upper 64 bits of each 128-bit half of the result.
This intrinsic corresponds to the VPHSUBSW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 1028 of file avx2intrin.h.
|
static |
Multiplies corresponding 16-bit elements of two 256-bit vectors of [16 x i16], forming 32-bit intermediate products, and adds pairs of those products to form 32-bit sums returned as elements of the [8 x i32] result.
There is only one wraparound case: when all four of the 16-bit sources are 0x8000, the result will be 0x80000000.
This intrinsic corresponds to the VPMADDWD
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 1090 of file avx2intrin.h.
Referenced by _mm256_mask_madd_epi16(), and _mm256_maskz_madd_epi16().
|
static |
Multiplies each unsigned byte from the 256-bit integer vector in __a with the corresponding signed byte from the 256-bit integer vector in __b, forming signed 16-bit intermediate products.
Adds adjacent pairs of those products using signed saturation to form 16-bit sums returned as elements of the [16 x i16] result.
This intrinsic corresponds to the VPMADDUBSW
instruction.
__a | A 256-bit vector containing one of the source operands. |
__b | A 256-bit vector containing one of the source operands. |
Definition at line 1058 of file avx2intrin.h.
Referenced by _mm256_mask_maddubs_epi16(), and _mm256_maskz_maddubs_epi16().
|
static |
Conditionally loads eight 32-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.
Returns the 256-bit [8 x i32] result.
This intrinsic corresponds to the VPMASKMOVD
instruction.
__X | A pointer to the memory used for loading values. |
__M | A 256-bit vector of [8 x i32] containing the mask bits. |
Definition at line 3522 of file avx2intrin.h.
|
static |
Conditionally loads four 64-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.
Returns the 256-bit [4 x i64] result.
This intrinsic corresponds to the VPMASKMOVQ
instruction.
__X | A pointer to the memory used for loading values. |
__M | A 256-bit vector of [4 x i64] containing the mask bits. |
Definition at line 3554 of file avx2intrin.h.
|
static |
Conditionally stores eight 32-bit integer elements from the 256-bit vector of [8 x i32] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.
This intrinsic corresponds to the VPMASKMOVD
instruction.
__X | A pointer to the memory used for storing values. |
__M | A 256-bit vector of [8 x i32] containing the mask bits. |
__Y | A 256-bit vector of [8 x i32] containing the values to store. |
Definition at line 3648 of file avx2intrin.h.
References __Y.
|
static |
Conditionally stores four 64-bit integer elements from the 256-bit vector of [4 x i64] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.
This intrinsic corresponds to the VPMASKMOVQ
instruction.
__X | A pointer to the memory used for storing values. |
__M | A 256-bit vector of [4 x i64] containing the mask bits. |
__Y | A 256-bit vector of [4 x i64] containing the values to store. |
Definition at line 3678 of file avx2intrin.h.
References __Y.
|
static |
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.
This intrinsic corresponds to the VPMAXSW
instruction.
__a | A 256-bit vector of [16 x i16]. |
__b | A 256-bit vector of [16 x i16]. |
Definition at line 1128 of file avx2intrin.h.
Referenced by _mm256_mask_max_epi16(), and _mm256_maskz_max_epi16().
|
static |
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.
This intrinsic corresponds to the VPMAXSD
instruction.
__a | A 256-bit vector of [8 x i32]. |
__b | A 256-bit vector of [8 x i32]. |
Definition at line 1147 of file avx2intrin.h.
Referenced by _mm256_mask_max_epi32(), and _mm256_maskz_max_epi32().
|
static |
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns the larger of each pair in the corresponding byte of the 256-bit result.
This intrinsic corresponds to the VPMAXSB
instruction.
__a | A 256-bit integer vector. |
__b | A 256-bit integer vector. |
Definition at line 1109 of file avx2intrin.h.
Referenced by _mm256_mask_max_epi8(), and _mm256_maskz_max_epi8().
|
static |
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.
This intrinsic corresponds to the VPMAXUW
instruction.
__a | A 256-bit vector of [16 x i16]. |
__b | A 256-bit vector of [16 x i16]. |
Definition at line 1185 of file avx2intrin.h.
Referenced by _mm256_mask_max_epu16(), and _mm256_maskz_max_epu16().
|
static |
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the larger of each pair in the corresponding element of the 256-bit result.
This intrinsic corresponds to the VPMAXUD
instruction.
__a | A 256-bit vector of [8 x i32]. |
__b | A 256-bit vector of [8 x i32]. |
Definition at line 1204 of file avx2intrin.h.
Referenced by _mm256_mask_max_epu32(), and _mm256_maskz_max_epu32().
|
static |
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the larger of each pair in the corresponding byte of the 256-bit result.
This intrinsic corresponds to the VPMAXUB
instruction.
__a | A 256-bit integer vector. |
__b | A 256-bit integer vector. |
Definition at line 1166 of file avx2intrin.h.
Referenced by _mm256_mask_max_epu8(), and _mm256_maskz_max_epu8().
|
static |
Compares the corresponding signed 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
This intrinsic corresponds to the VPMINSW
instruction.
__a | A 256-bit vector of [16 x i16]. |
__b | A 256-bit vector of [16 x i16]. |
Definition at line 1242 of file avx2intrin.h.
Referenced by _mm256_mask_min_epi16(), and _mm256_maskz_min_epi16().
|
static |
Compares the corresponding signed 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
This intrinsic corresponds to the VPMINSD
instruction.
__a | A 256-bit vector of [8 x i32]. |
__b | A 256-bit vector of [8 x i32]. |
Definition at line 1261 of file avx2intrin.h.
Referenced by _mm256_mask_min_epi32(), and _mm256_maskz_min_epi32().
|
static |
Compares the corresponding signed bytes in the two 256-bit integer vectors in __a and __b and returns the smaller of each pair in the corresponding byte of the 256-bit result.
This intrinsic corresponds to the VPMINSB
instruction.
__a | A 256-bit integer vector. |
__b | A 256-bit integer vector. |
Definition at line 1223 of file avx2intrin.h.
Referenced by _mm256_mask_min_epi8(), and _mm256_maskz_min_epi8().
|
static |
Compares the corresponding unsigned 16-bit integers in the two 256-bit vectors of [16 x i16] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
This intrinsic corresponds to the VPMINUW
instruction.
__a | A 256-bit vector of [16 x i16]. |
__b | A 256-bit vector of [16 x i16]. |
Definition at line 1299 of file avx2intrin.h.
Referenced by _mm256_mask_min_epu16(), and _mm256_maskz_min_epu16().
|
static |
Compares the corresponding unsigned 32-bit integers in the two 256-bit vectors of [8 x i32] in __a and __b and returns the smaller of each pair in the corresponding element of the 256-bit result.
This intrinsic corresponds to the VPMINUD
instruction.
__a | A 256-bit vector of [8 x i32]. |
__b | A 256-bit vector of [8 x i32]. |
Definition at line 1318 of file avx2intrin.h.
Referenced by _mm256_mask_min_epu32(), and _mm256_maskz_min_epu32().
|
static |
Compares the corresponding unsigned bytes in the two 256-bit integer vectors in __a and __b and returns the smaller of each pair in the corresponding byte of the 256-bit result.
This intrinsic corresponds to the VPMINUB
instruction.
__a | A 256-bit integer vector. |
__b | A 256-bit integer vector. |
Definition at line 1280 of file avx2intrin.h.
Referenced by _mm256_mask_min_epu8(), and _mm256_maskz_min_epu8().
|
static |
Creates a 32-bit integer mask from the most significant bit of each byte in the 256-bit integer vector in __a and returns the result.
This intrinsic corresponds to the VPMOVMSKB
instruction.
__a | A 256-bit integer vector containing the source bytes. |
Definition at line 1341 of file avx2intrin.h.
References __a.
|
static |
Multiplies signed 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and returns the 64-bit products in the [4 x i64] result.
This intrinsic corresponds to the VPMULDQ
instruction.
__a | A 256-bit vector of [8 x i32] containing one of the source operands. |
__b | A 256-bit vector of [8 x i32] containing one of the source operands. |
Definition at line 1679 of file avx2intrin.h.
Referenced by _mm256_mask_mul_epi32(), and _mm256_maskz_mul_epi32().
|
static |
Multiplies unsigned 32-bit integers from even-numbered elements of two 256-bit vectors of [8 x i32] and returns the 64-bit products in the [4 x i64] result.
This intrinsic corresponds to the VPMULUDQ
instruction.
__a | A 256-bit vector of [8 x i32] containing one of the source operands. |
__b | A 256-bit vector of [8 x i32] containing one of the source operands. |
Definition at line 1808 of file avx2intrin.h.
Referenced by _mm256_mask_mul_epu32(), and _mm256_maskz_mul_epu32().
|
static |
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper 16 bits of each 32-bit product in the [16 x i16] result.
This intrinsic corresponds to the VPMULHW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 1744 of file avx2intrin.h.
Referenced by _mm256_mask_mulhi_epi16(), and _mm256_maskz_mulhi_epi16().
|
static |
Multiplies unsigned 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the upper 16 bits of each 32-bit product in the [16 x i16] result.
This intrinsic corresponds to the VPMULHUW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 1725 of file avx2intrin.h.
Referenced by _mm256_mask_mulhi_epu16(), and _mm256_maskz_mulhi_epu16().
|
static |
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], truncates the 32-bit results to the most significant 18 bits, rounds by adding 1, and returns bits [16:1] of each rounded product in the [16 x i16] result.
This intrinsic corresponds to the VPMULHRSW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 1706 of file avx2intrin.h.
Referenced by _mm256_mask_mulhrs_epi16(), and _mm256_maskz_mulhrs_epi16().
|
static |
Multiplies signed 16-bit integer elements of two 256-bit vectors of [16 x i16], and returns the lower 16 bits of each 32-bit product in the [16 x i16] result.
This intrinsic corresponds to the VPMULLW
instruction.
__a | A 256-bit vector of [16 x i16] containing one of the source operands. |
__b | A 256-bit vector of [16 x i16] containing one of the source operands. |
Definition at line 1763 of file avx2intrin.h.
Referenced by _mm256_mask_mullo_epi16(), and _mm256_maskz_mullo_epi16().
|
static |
Multiplies signed 32-bit integer elements of two 256-bit vectors of [8 x i32], and returns the lower 32 bits of each 64-bit product in the [8 x i32] result.
This intrinsic corresponds to the VPMULLD
instruction.
__a | A 256-bit vector of [8 x i32] containing one of the source operands. |
__b | A 256-bit vector of [8 x i32] containing one of the source operands. |
Definition at line 1782 of file avx2intrin.h.
Referenced by _mm256_mask_mullo_epi32(), and _mm256_maskz_mullo_epi32().
|
static |
Computes the bitwise OR of the 256-bit integer vectors in __a and __b.
This intrinsic corresponds to the VPOR
instruction.
__a | A 256-bit integer vector. |
__b | A 256-bit integer vector. |
Definition at line 1826 of file avx2intrin.h.
|
static |
Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit integers using signed saturation, and returns the 256-bit result.
This intrinsic corresponds to the VPACKSSWB
instruction.
__a | A 256-bit vector of [16 x i16] used to generate result[63:0] and result[191:128]. |
__b | A 256-bit vector of [16 x i16] used to generate result[127:64] and result[255:192]. |
Definition at line 173 of file avx2intrin.h.
Referenced by _mm256_mask_packs_epi16(), and _mm256_maskz_packs_epi16().
|
static |
Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit integers using signed saturation, and returns the resulting 256-bit vector of [16 x i16].
This intrinsic corresponds to the VPACKSSDW
instruction.
__a | A 256-bit vector of [8 x i32] used to generate result[63:0] and result[191:128]. |
__b | A 256-bit vector of [8 x i32] used to generate result[127:64] and result[255:192]. |
Definition at line 205 of file avx2intrin.h.
Referenced by _mm256_mask_packs_epi32(), and _mm256_maskz_packs_epi32().
|
static |
Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers using unsigned saturation, and returns the 256-bit result.
This intrinsic corresponds to the VPACKUSWB
instruction.
__a | A 256-bit vector of [16 x i16] used to generate result[63:0] and result[191:128]. |
__b | A 256-bit vector of [16 x i16] used to generate result[127:64] and result[255:192]. |
Definition at line 236 of file avx2intrin.h.
Referenced by _mm256_mask_packus_epi16(), and _mm256_maskz_packus_epi16().
|
static |
Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers using unsigned saturation, and returns the resulting 256-bit vector of [16 x i16].
This intrinsic corresponds to the VPACKUSDW
instruction.
__V1 | A 256-bit vector of [8 x i32] used to generate result[63:0] and result[191:128]. |
__V2 | A 256-bit vector of [8 x i32] used to generate result[127:64] and result[255:192]. |
Definition at line 268 of file avx2intrin.h.
Referenced by _mm256_mask_packus_epi32(), and _mm256_maskz_packus_epi32().
|
static |
Sets the result's 256-bit vector of [8 x i32] to copies of elements of the 256-bit vector of [8 x i32] in __a as specified by indexes in the elements of the 256-bit vector of [8 x i32] in __b.
This intrinsic corresponds to the VPERMD
instruction.
__a | A 256-bit vector of [8 x i32] containing the source values. |
__b | A 256-bit vector of [8 x i32] containing indexes of values to use from __a. |
Definition at line 3312 of file avx2intrin.h.
|
static |
Sets the result's 256-bit vector of [8 x float] to copies of elements of the 256-bit vector of [8 x float] in __a as specified by indexes in the elements of the 256-bit vector of [8 x i32] in __b.
This intrinsic corresponds to the VPERMPS
instruction.
__a | A 256-bit vector of [8 x float] containing the source values. |
__b | A 256-bit vector of [8 x i32] containing indexes of values to use from __a. |
Definition at line 3370 of file avx2intrin.h.
|
static |
Computes four sum of absolute difference (SAD) operations on sets of eight unsigned 8-bit integers from the 256-bit integer vectors __a and __b.
One SAD result is computed for each set of eight bytes from __a and eight bytes from __b. The zero-extended SAD value is returned in the corresponding 64-bit element of the result.
A single SAD operation takes the differences between the corresponding bytes of __a and __b, takes the absolute value of each difference, and sums these eight values to form one 16-bit result. This operation is repeated four times with successive sets of eight bytes.
This intrinsic corresponds to the VPSADBW
instruction.
__a | A 256-bit integer vector. |
__b | A 256-bit integer vector. |
Definition at line 1871 of file avx2intrin.h.
|
static |
Shuffles 8-bit integers in the 256-bit integer vector __a according to control information in the 256-bit integer vector __b, and returns the 256-bit result.
In effect there are two separate 128-bit shuffles in the lower and upper halves.
This intrinsic corresponds to the VPSHUFB
instruction.
__a | A 256-bit integer vector containing source values. |
__b | A 256-bit integer vector containing control information to determine what goes into the corresponding byte of the result. If bit 7 of the control byte is 1, the result byte is 0; otherwise, bits 3:0 of the control byte specify the index (within the same 128-bit half) of __a to copy to the result byte. |
Definition at line 1910 of file avx2intrin.h.
Referenced by _mm256_mask_shuffle_epi8(), and _mm256_maskz_shuffle_epi8().
|
static |
Sets each element of the result to the corresponding element of the 256-bit vector of [16 x i16] in __a, the negative of that element, or zero, depending on whether the corresponding element of the 256-bit vector of [16 x i16] in __b is greater than zero, less than zero, or equal to zero, respectively.
This intrinsic corresponds to the VPSIGNW
instruction.
__a | A 256-bit vector of [16 x i16]. |
__b | A 256-bit vector of [16 x i16]. |
Definition at line 2058 of file avx2intrin.h.
|
static |
Sets each element of the result to the corresponding element of the 256-bit vector of [8 x i32] in __a, the negative of that element, or zero, depending on whether the corresponding element of the 256-bit vector of [8 x i32] in __b is greater than zero, less than zero, or equal to zero, respectively.
This intrinsic corresponds to the VPSIGND
instruction.
__a | A 256-bit vector of [8 x i32]. |
__b | A 256-bit vector of [8 x i32]. |
Definition at line 2079 of file avx2intrin.h.
|
static |
Sets each byte of the result to the corresponding byte of the 256-bit integer vector in __a, the negative of that byte, or zero, depending on whether the corresponding byte of the 256-bit integer vector in __b is greater than zero, less than zero, or equal to zero, respectively.
This intrinsic corresponds to the VPSIGNB
instruction.
__a | A 256-bit integer vector. |
__b | A 256-bit integer vector. |
Definition at line 2037 of file avx2intrin.h.
|
static |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by the number of bits specified by the lower 64 bits of __count, shifting in zero bits, and returns the result.
If __count is greater than 15, the returned result is all zeroes.
This intrinsic corresponds to the VPSLLW
instruction.
__a | A 256-bit vector of [16 x i16] to be shifted. |
__count | A 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored. |
Definition at line 2159 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_sll_epi16(), and _mm256_maskz_sll_epi16().
|
static |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
If __count is greater than 31, the returned result is all zeroes.
This intrinsic corresponds to the VPSLLD
instruction.
__a | A 256-bit vector of [8 x i32] to be shifted. |
__count | A 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored. |
Definition at line 2199 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_sll_epi32(), and _mm256_maskz_sll_epi32().
|
static |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
If __count is greater than 63, the returned result is all zeroes.
This intrinsic corresponds to the VPSLLQ
instruction.
__a | A 256-bit vector of [4 x i64] to be shifted. |
__count | A 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored. |
Definition at line 2239 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_sll_epi64(), and _mm256_maskz_sll_epi64().
|
static |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits, shifting in zero bits, and returns the result.
If __count is greater than 15, the returned result is all zeroes.
This intrinsic corresponds to the VPSLLW
instruction.
__a | A 256-bit vector of [16 x i16] to be shifted. |
__count | An unsigned integer value specifying the shift count (in bits). |
Definition at line 2138 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_slli_epi16(), and _mm256_maskz_slli_epi16().
|
static |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a left by __count bits, shifting in zero bits, and returns the result.
If __count is greater than 31, the returned result is all zeroes.
This intrinsic corresponds to the VPSLLD
instruction.
__a | A 256-bit vector of [8 x i32] to be shifted. |
__count | An unsigned integer value specifying the shift count (in bits). |
Definition at line 2178 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_slli_epi32(), and _mm256_maskz_slli_epi32().
|
static |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a left by __count bits, shifting in zero bits, and returns the result.
If __count is greater than 63, the returned result is all zeroes.
This intrinsic corresponds to the VPSLLQ
instruction.
__a | A 256-bit vector of [4 x i64] to be shifted. |
__count | An unsigned integer value specifying the shift count (in bits). |
Definition at line 2218 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_slli_epi64(), and _mm256_maskz_slli_epi64().
|
static |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X left by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in zero bits, and returns the result.
If the shift count for any element is greater than 31, the result for that element is zero.
This intrinsic corresponds to the VPSLLVD
instruction.
__X | A 256-bit vector of [8 x i32] to be shifted. |
__Y | A 256-bit vector of [8 x i32] containing the unsigned shift counts (in bits). |
Definition at line 3760 of file avx2intrin.h.
References __Y.
Referenced by _mm256_mask_sllv_epi32(), and _mm256_maskz_sllv_epi32().
|
static |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X left by the number of bits given in the corresponding element of the 256-bit vector of [4 x i64] in __Y, shifting in zero bits, and returns the result.
If the shift count for any element is greater than 63, the result for that element is zero.
This intrinsic corresponds to the VPSLLVQ
instruction.
__X | A 256-bit vector of [4 x i64] to be shifted. |
__Y | A 256-bit vector of [4 x i64] containing the unsigned shift counts (in bits). |
Definition at line 3804 of file avx2intrin.h.
References __Y.
Referenced by _mm256_mask_sllv_epi64(), and _mm256_maskz_sllv_epi64().
|
static |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits given in the lower 64 bits of __count, shifting in sign bits, and returns the result.
If __count is greater than 15, each element of the result is either 0 or -1 according to the corresponding input sign bit.
This intrinsic corresponds to the VPSRAW
instruction.
__a | A 256-bit vector of [16 x i16] to be shifted. |
__count | A 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored. |
Definition at line 2281 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_sra_epi16(), and _mm256_maskz_sra_epi16().
|
static |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits given in the lower 64 bits of __count, shifting in sign bits, and returns the result.
If __count is greater than 31, each element of the result is either 0 or -1 according to the corresponding input sign bit.
This intrinsic corresponds to the VPSRAD
instruction.
__a | A 256-bit vector of [8 x i32] to be shifted. |
__count | A 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored. |
Definition at line 2323 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_sra_epi32(), and _mm256_maskz_sra_epi32().
|
static |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits, shifting in sign bits, and returns the result.
If __count is greater than 15, each element of the result is either 0 or -1 according to the corresponding input sign bit.
This intrinsic corresponds to the VPSRAW
instruction.
__a | A 256-bit vector of [16 x i16] to be shifted. |
__count | An unsigned integer value specifying the shift count (in bits). |
Definition at line 2259 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_srai_epi16(), and _mm256_maskz_srai_epi16().
|
static |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits, shifting in sign bits, and returns the result.
If __count is greater than 31, each element of the result is either 0 or -1 according to the corresponding input sign bit.
This intrinsic corresponds to the VPSRAD
instruction.
__a | A 256-bit vector of [8 x i32] to be shifted. |
__count | An unsigned integer value specifying the shift count (in bits). |
Definition at line 2301 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_srai_epi32(), and _mm256_maskz_srai_epi32().
|
static |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in sign bits, and returns the result.
If the shift count for any element is greater than 31, the result for that element is 0 or -1 according to the sign bit for that element.
This intrinsic corresponds to the VPSRAVD
instruction.
__X | A 256-bit vector of [8 x i32] to be shifted. |
__Y | A 256-bit vector of [8 x i32] containing the unsigned shift counts (in bits). |
Definition at line 3849 of file avx2intrin.h.
References __Y.
Referenced by _mm256_mask_srav_epi32(), and _mm256_maskz_srav_epi32().
|
static |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
If __count is greater than 15, the returned result is all zeroes.
This intrinsic corresponds to the VPSRLW
instruction.
__a | A 256-bit vector of [16 x i16] to be shifted. |
__count | A 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored. |
Definition at line 2403 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_srl_epi16(), and _mm256_maskz_srl_epi16().
|
static |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
If __count is greater than 31, the returned result is all zeroes.
This intrinsic corresponds to the VPSRLD
instruction.
__a | A 256-bit vector of [8 x i32] to be shifted. |
__count | A 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored. |
Definition at line 2443 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_srl_epi32(), and _mm256_maskz_srl_epi32().
|
static |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by the number of bits given in the lower 64 bits of __count, shifting in zero bits, and returns the result.
If __count is greater than 63, the returned result is all zeroes.
This intrinsic corresponds to the VPSRLQ
instruction.
__a | A 256-bit vector of [4 x i64] to be shifted. |
__count | A 128-bit vector of [2 x i64] whose lower element gives the unsigned shift count (in bits). The upper element is ignored. |
Definition at line 2483 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_srl_epi64(), and _mm256_maskz_srl_epi64().
|
static |
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a right by __count bits, shifting in zero bits, and returns the result.
If __count is greater than 15, the returned result is all zeroes.
This intrinsic corresponds to the VPSRLW
instruction.
__a | A 256-bit vector of [16 x i16] to be shifted. |
__count | An unsigned integer value specifying the shift count (in bits). |
Definition at line 2382 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_srli_epi16(), and _mm256_maskz_srli_epi16().
|
static |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __a right by __count bits, shifting in zero bits, and returns the result.
If __count is greater than 31, the returned result is all zeroes.
This intrinsic corresponds to the VPSRLD
instruction.
__a | A 256-bit vector of [8 x i32] to be shifted. |
__count | An unsigned integer value specifying the shift count (in bits). |
Definition at line 2422 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_srli_epi32(), and _mm256_maskz_srli_epi32().
|
static |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __a right by __count bits, shifting in zero bits, and returns the result.
If __count is greater than 63, the returned result is all zeroes.
This intrinsic corresponds to the VPSRLQ
instruction.
__a | A 256-bit vector of [4 x i64] to be shifted. |
__count | An unsigned integer value specifying the shift count (in bits). |
Definition at line 2462 of file avx2intrin.h.
References __a.
Referenced by _mm256_mask_srli_epi64(), and _mm256_maskz_srli_epi64().
|
static |
Shifts each 32-bit element of the 256-bit vector of [8 x i32] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [8 x i32] in __Y, shifting in zero bits, and returns the result.
If the shift count for any element is greater than 31, the result for that element is zero.
This intrinsic corresponds to the VPSRLVD
instruction.
__X | A 256-bit vector of [8 x i32] to be shifted. |
__Y | A 256-bit vector of [8 x i32] containing the unsigned shift counts (in bits). |
Definition at line 3894 of file avx2intrin.h.
References __Y.
Referenced by _mm256_mask_srlv_epi32(), and _mm256_maskz_srlv_epi32().
|
static |
Shifts each 64-bit element of the 256-bit vector of [4 x i64] in __X right by the number of bits given in the corresponding element of the 256-bit vector of [4 x i64] in __Y, shifting in zero bits, and returns the result.
If the shift count for any element is greater than 63, the result for that element is zero.
This intrinsic corresponds to the VPSRLVQ
instruction.
__X | A 256-bit vector of [4 x i64] to be shifted. |
__Y | A 256-bit vector of [4 x i64] containing the unsigned shift counts (in bits). |
Definition at line 3938 of file avx2intrin.h.
References __Y.
Referenced by _mm256_mask_srlv_epi64(), and _mm256_maskz_srlv_epi64().
|
static |
Loads the 256-bit integer vector from memory __V using a non-temporal memory hint and returns the vector.
__V must be aligned on a 32-byte boundary.
This intrinsic corresponds to the VMOVNTDQA
instruction.
__V | A pointer to the 32-byte aligned memory containing the vector to load. |
Definition at line 2995 of file avx2intrin.h.
|
static |
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16].
Returns the lower 16 bits of each difference in the corresponding element of the [16 x i16] result (overflow is ignored).
This intrinsic corresponds to the VPSUBW
instruction.
__a | A 256-bit vector of [16 x i16] containing the minuends. |
__b | A 256-bit vector of [16 x i16] containing the subtrahends. |
Definition at line 2537 of file avx2intrin.h.
Referenced by _mm256_mask_sub_epi16(), and _mm256_maskz_sub_epi16().
|
static |
Subtracts 32-bit integers from corresponding elements of two 256-bit vectors of [8 x i32].
Returns the lower 32 bits of each difference in the corresponding element of the [8 x i32] result (overflow is ignored).
This intrinsic corresponds to the VPSUBD
instruction.
__a | A 256-bit vector of [8 x i32] containing the minuends. |
__b | A 256-bit vector of [8 x i32] containing the subtrahends. |
Definition at line 2563 of file avx2intrin.h.
Referenced by _mm256_mask_sub_epi32(), and _mm256_maskz_sub_epi32().
|
static |
Subtracts 64-bit integers from corresponding elements of two 256-bit vectors of [4 x i64].
Returns the lower 64 bits of each difference in the corresponding element of the [4 x i64] result (overflow is ignored).
This intrinsic corresponds to the VPSUBQ
instruction.
__a | A 256-bit vector of [4 x i64] containing the minuends. |
__b | A 256-bit vector of [4 x i64] containing the subtrahends. |
Definition at line 2589 of file avx2intrin.h.
Referenced by _mm256_mask_sub_epi64(), and _mm256_maskz_sub_epi64().
|
static |
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors.
Returns the lower 8 bits of each difference in the corresponding byte of the 256-bit integer vector result (overflow is ignored).
This intrinsic corresponds to the VPSUBB
instruction.
__a | A 256-bit integer vector containing the minuends. |
__b | A 256-bit integer vector containing the subtrahends. |
Definition at line 2510 of file avx2intrin.h.
Referenced by _mm256_mask_sub_epi8(), and _mm256_maskz_sub_epi8().
|
static |
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using signed saturation, and returns each difference in the corresponding element of the [16 x i16] result.
This intrinsic corresponds to the VPSUBSW
instruction.
__a | A 256-bit vector of [16 x i16] containing the minuends. |
__b | A 256-bit vector of [16 x i16] containing the subtrahends. |
Definition at line 2641 of file avx2intrin.h.
Referenced by _mm256_mask_subs_epi16(), and _mm256_maskz_subs_epi16().
|
static |
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using signed saturation, and returns each difference in the corresponding byte of the 256-bit integer vector result.
This intrinsic corresponds to the VPSUBSB
instruction.
__a | A 256-bit integer vector containing the minuends. |
__b | A 256-bit integer vector containing the subtrahends. |
Definition at line 2615 of file avx2intrin.h.
Referenced by _mm256_mask_subs_epi8(), and _mm256_maskz_subs_epi8().
|
static |
Subtracts 16-bit integers from corresponding elements of two 256-bit vectors of [16 x i16] using unsigned saturation, and returns each difference in the corresponding element of the [16 x i16] result.
This intrinsic corresponds to the VPSUBUSW
instruction.
__a | A 256-bit vector of [16 x i16] containing the minuends. |
__b | A 256-bit vector of [16 x i16] containing the subtrahends. |
Definition at line 2694 of file avx2intrin.h.
Referenced by _mm256_mask_subs_epu16(), and _mm256_maskz_subs_epu16().
|
static |
Subtracts 8-bit integers from corresponding bytes of two 256-bit integer vectors using unsigned saturation, and returns each difference in the corresponding byte of the 256-bit integer vector result.
For each byte, computes result = __a - __b, clamping the result to 0 when __b is greater than __a.
This intrinsic corresponds to the VPSUBUSB
instruction.
__a | A 256-bit integer vector containing the minuends. |
__b | A 256-bit integer vector containing the subtrahends. |
Definition at line 2668 of file avx2intrin.h.
Referenced by _mm256_mask_subs_epu8(), and _mm256_maskz_subs_epu8().
|
static |
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __b to return the resulting 256-bit vector of [16 x i16].
Specifically, uses the upper 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.
This intrinsic corresponds to the VPUNPCKHWD
instruction.
__a | A 256-bit vector of [16 x i16] used as the source for the even-numbered elements of the result. |
__b | A 256-bit vector of [16 x i16] used as the source for the odd-numbered elements of the result. |
Definition at line 2763 of file avx2intrin.h.
Referenced by _mm256_mask_unpackhi_epi16(), and _mm256_maskz_unpackhi_epi16().
|
static |
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b to return the resulting 256-bit vector of [8 x i32].
Specifically, uses the upper 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.
This intrinsic corresponds to the VPUNPCKHDQ
instruction.
__a | A 256-bit vector of [8 x i32] used as the source for the even-numbered elements of the result. |
__b | A 256-bit vector of [8 x i32] used as the source for the odd-numbered elements of the result. |
Definition at line 2797 of file avx2intrin.h.
Referenced by _mm256_mask_unpackhi_epi32(), and _mm256_maskz_unpackhi_epi32().
|
static |
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b to return the resulting 256-bit vector of [4 x i64].
Specifically, uses the upper 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.
This intrinsic corresponds to the VPUNPCKHQDQ
instruction.
__a | A 256-bit vector of [4 x i64] used as the source for the even-numbered elements of the result. |
__b | A 256-bit vector of [4 x i64] used as the source for the odd-numbered elements of the result. |
Definition at line 2827 of file avx2intrin.h.
Referenced by _mm256_mask_unpackhi_epi64(), and _mm256_maskz_unpackhi_epi64().
|
static |
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to form the 256-bit result.
Specifically, uses the upper 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.
This intrinsic corresponds to the VPUNPCKHBW
instruction.
__a | A 256-bit integer vector used as the source for the even-numbered bytes of the result. |
__b | A 256-bit integer vector used as the source for the odd-numbered bytes of the result. |
Definition at line 2728 of file avx2intrin.h.
Referenced by _mm256_mask_unpackhi_epi8(), and _mm256_maskz_unpackhi_epi8().
|
static |
Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors of [16 x i16] in __a and __b to return the resulting 256-bit vector of [16 x i16].
Specifically, uses the lower 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.
This intrinsic corresponds to the VPUNPCKLWD
instruction.
__a | A 256-bit vector of [16 x i16] used as the source for the even-numbered elements of the result. |
__b | A 256-bit vector of [16 x i16] used as the source for the odd-numbered elements of the result. |
Definition at line 2896 of file avx2intrin.h.
Referenced by _mm256_mask_unpacklo_epi16(), and _mm256_maskz_unpacklo_epi16().
|
static |
Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors of [8 x i32] in __a and __b to return the resulting 256-bit vector of [8 x i32].
Specifically, uses the lower 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.
This intrinsic corresponds to the VPUNPCKLDQ
instruction.
__a | A 256-bit vector of [8 x i32] used as the source for the even-numbered elements of the result. |
__b | A 256-bit vector of [8 x i32] used as the source for the odd-numbered elements of the result. |
Definition at line 2930 of file avx2intrin.h.
Referenced by _mm256_mask_unpacklo_epi32(), and _mm256_maskz_unpacklo_epi32().
|
static |
Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors of [4 x i64] in __a and __b to return the resulting 256-bit vector of [4 x i64].
Specifically, uses the lower 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.
This intrinsic corresponds to the VPUNPCKLQDQ
instruction.
__a | A 256-bit vector of [4 x i64] used as the source for the even-numbered elements of the result. |
__b | A 256-bit vector of [4 x i64] used as the source for the odd-numbered elements of the result. |
Definition at line 2960 of file avx2intrin.h.
Referenced by _mm256_mask_unpacklo_epi64(), and _mm256_maskz_unpacklo_epi64().
|
static |
Unpacks and interleaves 8-bit integers from parts of the 256-bit integer vectors in __a and __b to form the 256-bit result.
Specifically, uses the lower 64 bits of each 128-bit half of __a and __b as input; other bits in these parameters are ignored.
This intrinsic corresponds to the VPUNPCKLBW
instruction.
__a | A 256-bit integer vector used as the source for the even-numbered bytes of the result. |
__b | A 256-bit integer vector used as the source for the odd-numbered bytes of the result. |
Definition at line 2861 of file avx2intrin.h.
Referenced by _mm256_mask_unpacklo_epi8(), and _mm256_maskz_unpacklo_epi8().
|
static |
Computes the bitwise XOR of the 256-bit integer vectors in __a and __b.
This intrinsic corresponds to the VPXOR
instruction.
__a | A 256-bit integer vector. |
__b | A 256-bit integer vector. |
Definition at line 2978 of file avx2intrin.h.
|
static |
Broadcasts the low byte from the 128-bit integer vector in __X to all bytes of the 128-bit result.
This intrinsic corresponds to the VPBROADCASTB
instruction.
__X | A 128-bit integer vector whose low byte will be broadcast. |
Definition at line 3236 of file avx2intrin.h.
Referenced by _mm_mask_broadcastb_epi8(), and _mm_maskz_broadcastb_epi8().
|
static |
Broadcasts the low element from the 128-bit vector of [4 x i32] in __X to all elements of the result's vector of [4 x i32].
This intrinsic corresponds to the VPBROADCASTD
instruction.
__X | A 128-bit vector of [4 x i32] whose low element will be broadcast. |
Definition at line 3268 of file avx2intrin.h.
Referenced by _mm_mask_broadcastd_epi32(), and _mm_maskz_broadcastd_epi32().
|
static |
Broadcasts the low element from the 128-bit vector of [2 x i64] in __X to both elements of the result's 128-bit vector of [2 x i64].
This intrinsic corresponds to the VPBROADCASTQ
instruction.
__X | A 128-bit vector of [2 x i64] whose low element will be broadcast. |
Definition at line 3284 of file avx2intrin.h.
Referenced by _mm_mask_broadcastq_epi64(), and _mm_maskz_broadcastq_epi64().
|
static |
Broadcasts the 64-bit floating-point value from the low element of the 128-bit vector of [2 x double] in __a to both elements of the result's 128-bit vector of [2 x double].
This intrinsic corresponds to the MOVDDUP
instruction.
__a | A 128-bit vector of [2 x double] whose low element will be broadcast. |
Definition at line 3030 of file avx2intrin.h.
References __a.
|
static |
Broadcasts the 32-bit floating-point value from the low element of the 128-bit vector of [4 x float] in __X to all elements of the result's 128-bit vector of [4 x float].
This intrinsic corresponds to the VBROADCASTSS
instruction.
__X | A 128-bit vector of [4 x float] whose low element will be broadcast. |
Definition at line 3013 of file avx2intrin.h.
Referenced by _mm_mask_broadcastss_ps(), and _mm_maskz_broadcastss_ps().
|
static |
Broadcasts the low element from the 128-bit vector of [8 x i16] in __X to all elements of the result's 128-bit vector of [8 x i16].
This intrinsic corresponds to the VPBROADCASTW
instruction.
__X | A 128-bit vector of [8 x i16] whose low element will be broadcast. |
Definition at line 3252 of file avx2intrin.h.
Referenced by _mm_mask_broadcastw_epi16(), and _mm_maskz_broadcastw_epi16().
|
static |
Conditionally loads four 32-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.
Returns the 128-bit [4 x i32] result.
This intrinsic corresponds to the VPMASKMOVD
instruction.
__X | A pointer to the memory used for loading values. |
__M | A 128-bit vector of [4 x i32] containing the mask bits. |
Definition at line 3586 of file avx2intrin.h.
|
static |
Conditionally loads two 64-bit integer elements from memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, sets that element of the result to zero.
Returns the 128-bit [2 x i64] result.
This intrinsic corresponds to the VPMASKMOVQ
instruction.
__X | A pointer to the memory used for loading values. |
__M | A 128-bit vector of [2 x i64] containing the mask bits. |
Definition at line 3618 of file avx2intrin.h.
|
static |
Conditionally stores four 32-bit integer elements from the 128-bit vector of [4 x i32] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.
This intrinsic corresponds to the VPMASKMOVD
instruction.
__X | A pointer to the memory used for storing values. |
__M | A 128-bit vector of [4 x i32] containing the mask bits. |
__Y | A 128-bit vector of [4 x i32] containing the values to store. |
Definition at line 3708 of file avx2intrin.h.
References __Y.
|
static |
Conditionally stores two 64-bit integer elements from the 128-bit vector of [2 x i64] in __Y to memory __X, if the most significant bit of the corresponding element in the mask __M is set; otherwise, the memory element is unchanged.
This intrinsic corresponds to the VPMASKMOVQ
instruction.
__X | A pointer to the memory used for storing values. |
__M | A 128-bit vector of [2 x i64] containing the mask bits. |
__Y | A 128-bit vector of [2 x i64] containing the values to store. |
Definition at line 3738 of file avx2intrin.h.
References __Y.
|
static |
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X left by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in zero bits, and returns the result.
If the shift count for any element is greater than 31, the result for that element is zero.
This intrinsic corresponds to the VPSLLVD
instruction.
__X | A 128-bit vector of [4 x i32] to be shifted. |
__Y | A 128-bit vector of [4 x i32] containing the unsigned shift counts (in bits). |
Definition at line 3782 of file avx2intrin.h.
References __Y.
Referenced by _mm_mask_sllv_epi32(), and _mm_maskz_sllv_epi32().
|
static |
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X left by the number of bits given in the corresponding element of the 128-bit vector of [2 x i64] in __Y, shifting in zero bits, and returns the result.
If the shift count for any element is greater than 63, the result for that element is zero.
This intrinsic corresponds to the VPSLLVQ
instruction.
__X | A 128-bit vector of [2 x i64] to be shifted. |
__Y | A 128-bit vector of [2 x i64] containing the unsigned shift counts (in bits). |
Definition at line 3826 of file avx2intrin.h.
References __Y.
Referenced by _mm_mask_sllv_epi64(), and _mm_maskz_sllv_epi64().
|
static |
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in sign bits, and returns the result.
If the shift count for any element is greater than 31, the result for that element is 0 or -1 according to the sign bit for that element.
This intrinsic corresponds to the VPSRAVD
instruction.
__X | A 128-bit vector of [4 x i32] to be shifted. |
__Y | A 128-bit vector of [4 x i32] containing the unsigned shift counts (in bits). |
Definition at line 3872 of file avx2intrin.h.
References __Y.
Referenced by _mm_mask_srav_epi32(), and _mm_maskz_srav_epi32().
|
static |
Shifts each 32-bit element of the 128-bit vector of [4 x i32] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [4 x i32] in __Y, shifting in zero bits, and returns the result.
If the shift count for any element is greater than 31, the result for that element is zero.
This intrinsic corresponds to the VPSRLVD
instruction.
__X | A 128-bit vector of [4 x i32] to be shifted. |
__Y | A 128-bit vector of [4 x i32] containing the unsigned shift counts (in bits). |
Definition at line 3916 of file avx2intrin.h.
References __Y.
Referenced by _mm_mask_srlv_epi32(), and _mm_maskz_srlv_epi32().
|
static |
Shifts each 64-bit element of the 128-bit vector of [2 x i64] in __X right by the number of bits given in the corresponding element of the 128-bit vector of [2 x i64] in __Y, shifting in zero bits, and returns the result.
If the shift count for any element is greater than 63, the result for that element is zero.
This intrinsic corresponds to the VPSRLVQ
instruction.
__X | A 128-bit vector of [2 x i64] to be shifted. |
__Y | A 128-bit vector of [2 x i64] containing the unsigned shift counts (in bits). |
Definition at line 3960 of file avx2intrin.h.
References __Y.
Referenced by _mm_mask_srlv_epi64(), and _mm_maskz_srlv_epi64().