clang 20.0.0git
smmintrin.h
Go to the documentation of this file.
/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
10#ifndef __SMMINTRIN_H
11#define __SMMINTRIN_H
12
/* Refuse to compile on targets that are neither 32-bit nor 64-bit x86. */
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
16
17#include <tmmintrin.h>
18
/* Define the default attributes for the functions in this file.
 *
 * Both variants force inlining, suppress debug info, and declare a minimum
 * vector width of 128 bits. When __EVEX512__ is defined without
 * __AVX10_1_512__, the target string additionally carries "no-evex512".
 */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("sse4.1,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), \
                 __min_vector_width__(128)))
#endif
29
/* SSE4 Rounding macros. */

/* Rounding-direction selectors (low bits of the rounding control value). */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Exception-behaviour selectors, OR-ed with a direction selector. */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Ready-made combinations of an exception selector and a direction. */
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
46
/// Rounds up each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
63
/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
80
/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
105
/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
130
/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
147
/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded down.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
164
/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
189
/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
214
/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
247
/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M) \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
                                  (M)))
289
/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
322
/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M) \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
                                   (M)))
364
/* SSE4 Packed Blending Intrinsics. */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
                                   (__v2df)(__m128d)(V2), (int)(M)))
393
/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
                                  (int)(M)))
421
422/// Returns a 128-bit vector of [2 x double] where the values are
423/// selected from either the first or second operand as specified by the
424/// third operand, the control mask.
425///
426/// \headerfile <x86intrin.h>
427///
428/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
429///
430/// \param __V1
431/// A 128-bit vector of [2 x double].
432/// \param __V2
433/// A 128-bit vector of [2 x double].
434/// \param __M
435/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
436/// values are to be copied. The position of the mask bit corresponds to the
437/// most significant bit of a copied value. When a mask bit is 0, the
438/// corresponding 64-bit element in operand \a __V1 is copied to the same
439/// position in the result. When a mask bit is 1, the corresponding 64-bit
440/// element in operand \a __V2 is copied to the same position in the result.
441/// \returns A 128-bit vector of [2 x double] containing the copied values.
442static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
443 __m128d __V2,
444 __m128d __M) {
445 return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
446 (__v2df)__M);
447}
448
449/// Returns a 128-bit vector of [4 x float] where the values are
450/// selected from either the first or second operand as specified by the
451/// third operand, the control mask.
452///
453/// \headerfile <x86intrin.h>
454///
455/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
456///
457/// \param __V1
458/// A 128-bit vector of [4 x float].
459/// \param __V2
460/// A 128-bit vector of [4 x float].
461/// \param __M
462/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
463/// how the values are to be copied. The position of the mask bit corresponds
464/// to the most significant bit of a copied value. When a mask bit is 0, the
465/// corresponding 32-bit element in operand \a __V1 is copied to the same
466/// position in the result. When a mask bit is 1, the corresponding 32-bit
467/// element in operand \a __V2 is copied to the same position in the result.
468/// \returns A 128-bit vector of [4 x float] containing the copied values.
469static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
470 __m128 __V2,
471 __m128 __M) {
472 return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
473 (__v4sf)__M);
474}
475
476/// Returns a 128-bit vector of [16 x i8] where the values are selected
477/// from either of the first or second operand as specified by the third
478/// operand, the control mask.
479///
480/// \headerfile <x86intrin.h>
481///
482/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
483///
484/// \param __V1
485/// A 128-bit vector of [16 x i8].
486/// \param __V2
487/// A 128-bit vector of [16 x i8].
488/// \param __M
489/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
490/// how the values are to be copied. The position of the mask bit corresponds
491/// to the most significant bit of a copied value. When a mask bit is 0, the
492/// corresponding 8-bit element in operand \a __V1 is copied to the same
493/// position in the result. When a mask bit is 1, the corresponding 8-bit
494/// element in operand \a __V2 is copied to the same position in the result.
495/// \returns A 128-bit vector of [16 x i8] containing the copied values.
496static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
497 __m128i __V2,
498 __m128i __M) {
499 return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
500 (__v16qi)__M);
501}
502
/// Returns a 128-bit vector of [8 x i16] where the values are selected
///    from either of the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
///
/// \param V1
///    A 128-bit vector of [8 x i16].
/// \param V2
///    A 128-bit vector of [8 x i16].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \
                                      (__v8hi)(__m128i)(V2), (int)(M)))
530
531/* SSE4 Dword Multiply Instructions. */
532/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
533/// and returns the lower 32 bits of the each product in a 128-bit vector of
534/// [4 x i32].
535///
536/// \headerfile <x86intrin.h>
537///
538/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
539///
540/// \param __V1
541/// A 128-bit integer vector.
542/// \param __V2
543/// A 128-bit integer vector.
544/// \returns A 128-bit integer vector containing the products of both operands.
545static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
546 __m128i __V2) {
547 return (__m128i)((__v4su)__V1 * (__v4su)__V2);
548}
549
550/// Multiplies corresponding even-indexed elements of two 128-bit
551/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
552/// containing the products.
553///
554/// \headerfile <x86intrin.h>
555///
556/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
557///
558/// \param __V1
559/// A 128-bit vector of [4 x i32].
560/// \param __V2
561/// A 128-bit vector of [4 x i32].
562/// \returns A 128-bit vector of [2 x i64] containing the products of both
563/// operands.
564static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
565 __m128i __V2) {
566 return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
567}
568
/* SSE4 Floating Point Dot Product Instructions. */
/// Computes the dot product of the two 128-bit vectors of [4 x float]
///    and returns it in the elements of the 128-bit result vector of
///    [4 x float].
///
///    The immediate integer operand controls which input elements
///    will contribute to the dot product, and where the final results are
///    returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param Y
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand. Mask bits [7:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [3:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [3] corresponding to the highest element of
///    the [4 x float] vector. If a bit is set, the dot product is returned
///    in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) \
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
604
/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
///    [2 x double].
///
///    The immediate integer operand controls which input
///    elements will contribute to the dot product, and where the final results
///    are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
///    double] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [1:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [1] corresponding to the highest element of
///    the [2 x double] vector. If a bit is set, the dot product is returned in
///    the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
#define _mm_dp_pd(X, Y, M) \
  ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
                                (M)))
639
640/* SSE4 Streaming Load Hint Instruction. */
641/// Loads integer values from a 128-bit aligned memory location to a
642/// 128-bit integer vector.
643///
644/// \headerfile <x86intrin.h>
645///
646/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
647///
648/// \param __V
649/// A pointer to a 128-bit aligned memory location that contains the integer
650/// values.
651/// \returns A 128-bit integer vector containing the data stored at the
652/// specified memory location.
653static __inline__ __m128i __DEFAULT_FN_ATTRS
654_mm_stream_load_si128(const void *__V) {
655 return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
656}
657
658/* SSE4 Packed Integer Min/Max Instructions. */
659/// Compares the corresponding elements of two 128-bit vectors of
660/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
661/// of the two values.
662///
663/// \headerfile <x86intrin.h>
664///
665/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
666///
667/// \param __V1
668/// A 128-bit vector of [16 x i8].
669/// \param __V2
670/// A 128-bit vector of [16 x i8]
671/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
672static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
673 __m128i __V2) {
674 return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
675}
676
677/// Compares the corresponding elements of two 128-bit vectors of
678/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
679/// greater value of the two.
680///
681/// \headerfile <x86intrin.h>
682///
683/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
684///
685/// \param __V1
686/// A 128-bit vector of [16 x i8].
687/// \param __V2
688/// A 128-bit vector of [16 x i8].
689/// \returns A 128-bit vector of [16 x i8] containing the greater values.
690static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
691 __m128i __V2) {
692 return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
693}
694
695/// Compares the corresponding elements of two 128-bit vectors of
696/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
697/// value of the two.
698///
699/// \headerfile <x86intrin.h>
700///
701/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
702///
703/// \param __V1
704/// A 128-bit vector of [8 x u16].
705/// \param __V2
706/// A 128-bit vector of [8 x u16].
707/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
708static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
709 __m128i __V2) {
710 return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
711}
712
713/// Compares the corresponding elements of two 128-bit vectors of
714/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
715/// greater value of the two.
716///
717/// \headerfile <x86intrin.h>
718///
719/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
720///
721/// \param __V1
722/// A 128-bit vector of [8 x u16].
723/// \param __V2
724/// A 128-bit vector of [8 x u16].
725/// \returns A 128-bit vector of [8 x u16] containing the greater values.
726static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
727 __m128i __V2) {
728 return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
729}
730
731/// Compares the corresponding elements of two 128-bit vectors of
732/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
733/// value of the two.
734///
735/// \headerfile <x86intrin.h>
736///
737/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
738///
739/// \param __V1
740/// A 128-bit vector of [4 x i32].
741/// \param __V2
742/// A 128-bit vector of [4 x i32].
743/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
744static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
745 __m128i __V2) {
746 return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
747}
748
749/// Compares the corresponding elements of two 128-bit vectors of
750/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
751/// greater value of the two.
752///
753/// \headerfile <x86intrin.h>
754///
755/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
756///
757/// \param __V1
758/// A 128-bit vector of [4 x i32].
759/// \param __V2
760/// A 128-bit vector of [4 x i32].
761/// \returns A 128-bit vector of [4 x i32] containing the greater values.
762static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
763 __m128i __V2) {
764 return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
765}
766
767/// Compares the corresponding elements of two 128-bit vectors of
768/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
769/// value of the two.
770///
771/// \headerfile <x86intrin.h>
772///
773/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
774///
775/// \param __V1
776/// A 128-bit vector of [4 x u32].
777/// \param __V2
778/// A 128-bit vector of [4 x u32].
779/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
780static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
781 __m128i __V2) {
782 return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
783}
784
785/// Compares the corresponding elements of two 128-bit vectors of
786/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
787/// greater value of the two.
788///
789/// \headerfile <x86intrin.h>
790///
791/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
792///
793/// \param __V1
794/// A 128-bit vector of [4 x u32].
795/// \param __V2
796/// A 128-bit vector of [4 x u32].
797/// \returns A 128-bit vector of [4 x u32] containing the greater values.
798static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
799 __m128i __V2) {
800 return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
801}
802
803/* SSE4 Insertion and Extraction from XMM Register Instructions. */
/// Inserts one single-precision element of \a Y into a copy of \a X and then
///    zeroes selected elements of that result, both as directed by the
///    immediate \a N. The resulting 128-bit vector of [4 x float] is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. Every bit of this operand is copied to
///    the result, except for the element overwritten from \a Y and the
///    elements zeroed by bits [3:0] of \a N.
/// \param Y
///    A 128-bit vector of [4 x float] supplying the one single-precision
///    element, chosen by \a N, that is copied into the result.
/// \param N
///    An immediate value made up of three fields: \n
///    Bits [7:6] pick the 32-bit element of \a Y to copy: \n
///      00: Bits [31:0] of \a Y. \n
///      01: Bits [63:32] of \a Y. \n
///      10: Bits [95:64] of \a Y. \n
///      11: Bits [127:96] of \a Y. \n
///    Bits [5:4] pick the 32-bit element of the result that receives it: \n
///      00: Result bits [31:0]. \n
///      01: Result bits [63:32]. \n
///      10: Result bits [95:64]. \n
///      11: Result bits [127:96]. \n
///    Bits [3:0]: each bit that is set clears the corresponding element of
///    the result.
/// \returns A 128-bit vector of [4 x float] holding the single-precision
///    elements assembled from the two operands.
#define _mm_insert_ps(X, Y, N) \
  __builtin_ia32_insertps128((X), (Y), (N))
845
/// Returns, as a 32-bit integer, the raw bits of the element of a 128-bit
///    vector of [4 x float] selected by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_ps(__m128 X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param N
///    An immediate value. Bits [1:0] select the element of \a X whose bits
///    are returned: \n
///    00: Bits [31:0] of \a X. \n
///    01: Bits [63:32] of \a X. \n
///    10: Bits [95:64] of \a X. \n
///    11: Bits [127:96] of \a X.
/// \returns A 32-bit integer holding the 32 bits of the selected float; the
///    value is bit-cast, not numerically converted.
#define _mm_extract_ps(X, N) \
  __builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), \
                                                      (int)(N)))
871
872/* Miscellaneous insert and extract macros. */
/* Assign to D the single-precision element of the [4 x float] vector X that
   is selected by index N. Expands to a statement, not an expression. */
#define _MM_EXTRACT_FLOAT(D, X, N) \
  do { \
    (D) = __builtin_ia32_vec_ext_v4sf( \
        (__v4sf)(__m128)(X), (int)(N)); \
  } while (0)
878
/* Build an immediate suitable for _mm_insert_ps: X is placed in bits [7:6],
   Y in bits [5:4], and the zeroing bits Z are OR-ed in unshifted. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) \
  (((X) << 6) | ((Y) << 4) | (Z))
882
/* Copy the float at index N of X into element 0 of an otherwise zero vector:
   the insert targets element 0 and the 0x0e mask clears elements 1 to 3. */
#define _MM_PICK_OUT_PS(X, N) \
  _mm_insert_ps(_mm_setzero_ps(), (X), \
                _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
886
887/* Insert int into packed integer array at index. */
/// Builds a 128-bit vector of [16 x i8] by copying the 128-bit integer vector
///    \a X and replacing the element selected by the immediate \a N with the
///    lower 8 bits of the integer \a I.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. It is copied to the result, after
///    which one of the sixteen elements is overwritten.
/// \param I
///    An integer. Only its lower 8 bits are written to the result.
/// \param N
///    An immediate value. Bits [3:0] select which of the sixteen 8-bit
///    elements of the result is replaced: a value of k (0000 through 1111)
///    writes to result bits [8*k+7:8*k]. For example: \n
///    0000: Bits [7:0] of the result are replaced. \n
///    0001: Bits [15:8] of the result are replaced. \n
///    1111: Bits [127:120] of the result are replaced.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v16qi( \
      (__v16qi)(__m128i)(X), (int)(I), (int)(N)))
931
/// Builds a 128-bit vector of [4 x i32] by copying the 128-bit integer vector
///    \a X and replacing the element selected by the immediate \a N with the
///    32-bit integer \a I.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32]. It is copied to the result, after
///    which one of the four elements is overwritten by \a I.
/// \param I
///    The 32-bit integer written into the selected element.
/// \param N
///    An immediate value. Bits [1:0] select the element of the result that is
///    replaced: \n
///    00: Bits [31:0] of the result are replaced. \n
///    01: Bits [63:32] of the result are replaced. \n
///    10: Bits [95:64] of the result are replaced. \n
///    11: Bits [127:96] of the result are replaced.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v4si( \
      (__v4si)(__m128i)(X), (int)(I), (int)(N)))
963
#ifdef __x86_64__
/// Builds a 128-bit vector of [2 x i64] by copying the 128-bit integer vector
///    \a X and replacing the element selected by the immediate \a N with the
///    64-bit integer \a I. Only defined when compiling for x86-64.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [2 x i64]. It is copied to the result, after
///    which one of the two elements is overwritten by \a I.
/// \param I
///    The 64-bit integer written into the selected element.
/// \param N
///    An immediate value. Bit [0] selects the element of the result that is
///    replaced: \n
///    0: Bits [63:0] of the result are replaced. \n
///    1: Bits [127:64] of the result are replaced.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v2di( \
      (__v2di)(__m128i)(X), (long long)(I), (int)(N)))
#endif /* __x86_64__ */
995
996/* Extract int from packed integer array at index. This returns the element
997 * as a zero extended value, so it is unsigned.
998 */
/// Returns, zero-extended to an int, the 8-bit element of a 128-bit integer
///    vector of [16 x i8] that is selected by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] select which of the sixteen 8-bit
///    elements of \a X is extracted: a value of k (0000 through 1111) reads
///    bits [8*k+7:8*k] of \a X. For example: \n
///    0000: Bits [7:0] of \a X are extracted. \n
///    0001: Bits [15:8] of \a X are extracted. \n
///    1111: Bits [127:120] of \a X are extracted.
/// \returns An integer whose lower 8 bits hold the selected element and whose
///    remaining bits are zero (the element is passed through unsigned char
///    before widening).
#define _mm_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi( \
      (__v16qi)(__m128i)(X), (int)(N)))
1037
/// Returns the 32-bit element of a 128-bit integer vector of [4 x i32] that
///    is selected by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi32(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [1:0] select the 32-bit element of \a X that
///    is extracted: \n
///    00: Bits [31:0] of \a X are extracted. \n
///    01: Bits [63:32] of \a X are extracted. \n
///    10: Bits [95:64] of \a X are extracted. \n
///    11: Bits [127:96] of \a X are extracted.
/// \returns A 32-bit integer holding the selected element.
#define _mm_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v4si( \
      (__v4si)(__m128i)(X), (int)(N)))
1062
/// Returns the 64-bit element of a 128-bit integer vector of [2 x i64] that
///    is selected by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm_extract_epi64(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
/// in 64-bit mode.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bit [0] selects the 64-bit element of \a X that is
///    returned: \n
///    0: Bits [63:0] are returned. \n
///    1: Bits [127:64] are returned.
/// \returns A 64-bit integer holding the selected element.
#define _mm_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v2di( \
      (__v2di)(__m128i)(X), (int)(N)))
1085
1086/* SSE4 128-bit Packed Integer Comparisons. */
1087/// Tests whether the specified bits in a 128-bit integer vector are all
1088/// zeros.
1089///
1090/// \headerfile <x86intrin.h>
1091///
1092/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1093///
1094/// \param __M
1095/// A 128-bit integer vector containing the bits to be tested.
1096/// \param __V
1097/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1098/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1099static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
1100 __m128i __V) {
1101 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
1102}
1103
1104/// Tests whether the specified bits in a 128-bit integer vector are all
1105/// ones.
1106///
1107/// \headerfile <x86intrin.h>
1108///
1109/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1110///
1111/// \param __M
1112/// A 128-bit integer vector containing the bits to be tested.
1113/// \param __V
1114/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1115/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
1116static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
1117 __m128i __V) {
1118 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
1119}
1120
1121/// Tests whether the specified bits in a 128-bit integer vector are
1122/// neither all zeros nor all ones.
1123///
1124/// \headerfile <x86intrin.h>
1125///
1126/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1127///
1128/// \param __M
1129/// A 128-bit integer vector containing the bits to be tested.
1130/// \param __V
1131/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1132/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1133/// FALSE otherwise.
1134static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
1135 __m128i __V) {
1136 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
1137}
1138
/// Checks whether every bit of a 128-bit integer vector is set to 1.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_ones(__m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param V
///    A 128-bit integer vector holding the bits to test.
/// \returns TRUE if all bits of \a V are 1; FALSE otherwise.
#define _mm_test_all_ones(V) \
  _mm_testc_si128((V), _mm_set1_epi32(-1))
1155
/// Checks whether the bits of a 128-bit integer vector selected by a mask are
///    a mixture, that is, neither all zeros nor all ones. This is an alias
///    for _mm_testnzc_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector holding the bits to test.
/// \param V
///    A 128-bit integer vector selecting which bits of \a M are tested.
/// \returns TRUE if the selected bits are neither all zeros nor all ones;
///    FALSE otherwise.
#define _mm_test_mix_ones_zeros(M, V) \
  _mm_testnzc_si128((M), (V))
1174
/// Checks whether the bits of a 128-bit integer vector selected by a mask are
///    all zeros. This is an alias for _mm_testz_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector holding the bits to test.
/// \param V
///    A 128-bit integer vector selecting which bits of \a M are tested.
/// \returns TRUE if every selected bit is zero; FALSE otherwise.
#define _mm_test_all_zeros(M, V) \
  _mm_testz_si128((M), (V))
1192
1193/* SSE4 64-bit Packed Integer Comparisons. */
1194/// Compares each of the corresponding 64-bit values of the 128-bit
1195/// integer vectors for equality.
1196///
1197/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1198///
1199/// \headerfile <x86intrin.h>
1200///
1201/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
1202///
1203/// \param __V1
1204/// A 128-bit integer vector.
1205/// \param __V2
1206/// A 128-bit integer vector.
1207/// \returns A 128-bit integer vector containing the comparison results.
1208static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
1209 __m128i __V2) {
1210 return (__m128i)((__v2di)__V1 == (__v2di)__V2);
1211}
1212
1213/* SSE4 Packed Integer Sign-Extension. */
1214/// Sign-extends each of the lower eight 8-bit integer elements of a
1215/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1216/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1217/// are unused.
1218///
1219/// \headerfile <x86intrin.h>
1220///
1221/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
1222///
1223/// \param __V
1224/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1225/// sign-extended to 16-bit values.
1226/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
1227static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
1228 /* This function always performs a signed extension, but __v16qi is a char
1229 which may be signed or unsigned, so use __v16qs. */
1230 return (__m128i) __builtin_convertvector(
1231 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
1232 7),
1233 __v8hi);
1234}
1235
1236/// Sign-extends each of the lower four 8-bit integer elements of a
1237/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1238/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1239/// vector are unused.
1240///
1241/// \headerfile <x86intrin.h>
1242///
1243/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
1244///
1245/// \param __V
1246/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1247/// sign-extended to 32-bit values.
1248/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1249static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
1250 /* This function always performs a signed extension, but __v16qi is a char
1251 which may be signed or unsigned, so use __v16qs. */
1252 return (__m128i) __builtin_convertvector(
1253 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
1254}
1255
1256/// Sign-extends each of the lower two 8-bit integer elements of a
1257/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1258/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1259/// vector are unused.
1260///
1261/// \headerfile <x86intrin.h>
1262///
1263/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
1264///
1265/// \param __V
1266/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1267/// sign-extended to 64-bit values.
1268/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1269static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
1270 /* This function always performs a signed extension, but __v16qi is a char
1271 which may be signed or unsigned, so use __v16qs. */
1272 return (__m128i) __builtin_convertvector(
1273 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
1274}
1275
1276/// Sign-extends each of the lower four 16-bit integer elements of a
1277/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1278/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1279/// vector are unused.
1280///
1281/// \headerfile <x86intrin.h>
1282///
1283/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
1284///
1285/// \param __V
1286/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1287/// sign-extended to 32-bit values.
1288/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1289static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
1290 return (__m128i) __builtin_convertvector(
1291 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
1292}
1293
1294/// Sign-extends each of the lower two 16-bit integer elements of a
1295/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1296/// a 128-bit vector of [2 x i64]. The upper six elements of the input
1297/// vector are unused.
1298///
1299/// \headerfile <x86intrin.h>
1300///
1301/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
1302///
1303/// \param __V
1304/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1305/// sign-extended to 64-bit values.
1306/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1307static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
1308 return (__m128i) __builtin_convertvector(
1309 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
1310}
1311
1312/// Sign-extends each of the lower two 32-bit integer elements of a
1313/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1314/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1315/// are unused.
1316///
1317/// \headerfile <x86intrin.h>
1318///
1319/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
1320///
1321/// \param __V
1322/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1323/// sign-extended to 64-bit values.
1324/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1325static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
1326 return (__m128i) __builtin_convertvector(
1327 __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
1328}
1329
1330/* SSE4 Packed Integer Zero-Extension. */
1331/// Zero-extends each of the lower eight 8-bit integer elements of a
1332/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1333/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1334/// are unused.
1335///
1336/// \headerfile <x86intrin.h>
1337///
1338/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
1339///
1340/// \param __V
1341/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1342/// zero-extended to 16-bit values.
1343/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
1344static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
1345 return (__m128i) __builtin_convertvector(
1346 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
1347 7),
1348 __v8hi);
1349}
1350
1351/// Zero-extends each of the lower four 8-bit integer elements of a
1352/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1353/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1354/// vector are unused.
1355///
1356/// \headerfile <x86intrin.h>
1357///
1358/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
1359///
1360/// \param __V
1361/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1362/// zero-extended to 32-bit values.
1363/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1364static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
1365 return (__m128i) __builtin_convertvector(
1366 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
1367}
1368
1369/// Zero-extends each of the lower two 8-bit integer elements of a
1370/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1371/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1372/// vector are unused.
1373///
1374/// \headerfile <x86intrin.h>
1375///
1376/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
1377///
1378/// \param __V
1379/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1380/// zero-extended to 64-bit values.
1381/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1382static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
1383 return (__m128i) __builtin_convertvector(
1384 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
1385}
1386
1387/// Zero-extends each of the lower four 16-bit integer elements of a
1388/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1389/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1390/// vector are unused.
1391///
1392/// \headerfile <x86intrin.h>
1393///
1394/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
1395///
1396/// \param __V
1397/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1398/// zero-extended to 32-bit values.
1399/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1400static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
1401 return (__m128i) __builtin_convertvector(
1402 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
1403}
1404
1405/// Zero-extends each of the lower two 16-bit integer elements of a
1406/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1407/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1408/// are unused.
1409///
1410/// \headerfile <x86intrin.h>
1411///
1412/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
1413///
1414/// \param __V
1415/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1416/// zero-extended to 64-bit values.
1417/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1418static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
1419 return (__m128i) __builtin_convertvector(
1420 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
1421}
1422
1423/// Zero-extends each of the lower two 32-bit integer elements of a
1424/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1425/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1426/// are unused.
1427///
1428/// \headerfile <x86intrin.h>
1429///
1430/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
1431///
1432/// \param __V
1433/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1434/// zero-extended to 64-bit values.
1435/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1436static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
1437 return (__m128i) __builtin_convertvector(
1438 __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
1439}
1440
1441/* SSE4 Pack with Unsigned Saturation. */
1442/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
1443/// vector operands into 16-bit unsigned integers, and returns the packed
1444/// result.
1445///
1446/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
1447/// 0x0000 are saturated to 0x0000.
1448///
1449/// \headerfile <x86intrin.h>
1450///
1451/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
1452///
1453/// \param __V1
1454/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
1455/// written to the lower 64 bits of the result.
1456/// \param __V2
1457/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
1458/// written to the higher 64 bits of the result.
1459/// \returns A 128-bit vector of [8 x i16] containing the converted values.
1460static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
1461 __m128i __V2) {
1462 return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
1463}
1464
1465/* SSE4 Multiple Packed Sums of Absolute Difference. */
/// Computes eight sums of absolute differences between groups of four
///    unsigned 8-bit integers taken from the two operands. The groups that
///    are compared are chosen by the bit fields of the immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand that selects the starting offsets used in
///    the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4;
///    j = M10 * 4;
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
///      r[k] = d0 + d1 + d2 + d3;
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) \
  ((__m128i)__builtin_ia32_mpsadbw128( \
      (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (M)))
1504
1505/// Finds the minimum unsigned 16-bit element in the input 128-bit
1506/// vector of [8 x u16] and returns it and along with its index.
1507///
1508/// \headerfile <x86intrin.h>
1509///
1510/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
1511/// instruction.
1512///
1513/// \param __V
1514/// A 128-bit vector of [8 x u16].
1515/// \returns A 128-bit value where bits [15:0] contain the minimum value found
1516/// in parameter \a __V, bits [18:16] contain the index of the minimum value
1517/// and the remaining bits are set to 0.
1518static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
 /* __V is viewed as __v8hi for the builtin; the result is cast back to
    __m128i for the caller. */
1519 return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
1520}
1521
1522/* Handle the sse4.2 definitions here. */
1523
1524/* These definitions are normally in nmmintrin.h, but gcc puts them in here
1525 so we'll do the same. */
1526
/* From here on __DEFAULT_FN_ATTRS selects the "sse4.2" target feature. This
   redefinition carries only __always_inline__, __nodebug__ and the target
   attribute. */
1527#undef __DEFAULT_FN_ATTRS
1528#define __DEFAULT_FN_ATTRS \
1529 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
1530
1531/* These specify the type of data that we're comparing. They occupy bits
   [1:0] of the immediate: unsigned bytes, unsigned words, signed bytes and
   signed words, in that order. */
1532#define _SIDD_UBYTE_OPS 0x00
1533#define _SIDD_UWORD_OPS 0x01
1534#define _SIDD_SBYTE_OPS 0x02
1535#define _SIDD_SWORD_OPS 0x03
1536
1537/* These specify the type of comparison operation. They occupy bits [3:2] of
   the immediate; the intrinsic documentation below calls the four modes
   Subset, Ranges, Match and Substring, in that order. */
1538#define _SIDD_CMP_EQUAL_ANY 0x00
1539#define _SIDD_CMP_RANGES 0x04
1540#define _SIDD_CMP_EQUAL_EACH 0x08
1541#define _SIDD_CMP_EQUAL_ORDERED 0x0c
1542
1543/* These macros specify the polarity of the operation. They occupy bits
   [5:4] of the immediate and control whether the comparison bit mask is
   negated. */
1544#define _SIDD_POSITIVE_POLARITY 0x00
1545#define _SIDD_NEGATIVE_POLARITY 0x10
1546#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
1547#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
1548
1549/* These macros are used in _mm_cmpXstri() to specify the return. They set
   bit 6 of the immediate: index of the least or of the most significant set
   bit. */
1550#define _SIDD_LEAST_SIGNIFICANT 0x00
1551#define _SIDD_MOST_SIGNIFICANT 0x40
1552
1553/* These macros are used in _mm_cmpXstrm() to specify the return. They use
   the same bit 6 of the immediate: a bit mask zero-extended to 16 bytes, or
   a mask expanded to 16 bytes by repeating each bit. */
1554#define _SIDD_BIT_MASK 0x00
1555#define _SIDD_UNIT_MASK 0x40
1556
1557/* SSE4.2 Packed Comparison Intrinsics. */
1558/// Uses the immediate operand \a M to perform a comparison of string
1559/// data with implicitly defined lengths that is contained in source operands
1560/// \a A and \a B. Returns a 128-bit integer vector representing the result
1561/// mask of the comparison.
1562///
1563/// \headerfile <x86intrin.h>
1564///
1565/// \code
1566/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
1567/// \endcode
1568///
1569/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
1570/// instruction.
1571///
1572/// \param A
1573/// A 128-bit integer vector containing one of the source operands to be
1574/// compared.
1575/// \param B
1576/// A 128-bit integer vector containing one of the source operands to be
1577/// compared.
1578/// \param M
1579/// An 8-bit immediate operand specifying whether the characters are bytes or
1580/// words, the type of comparison to perform, and the format of the return
1581/// value. \n
1582/// Bits [1:0]: Determine source data format. \n
1583/// 00: 16 unsigned bytes \n
1584/// 01: 8 unsigned words \n
1585/// 10: 16 signed bytes \n
1586/// 11: 8 signed words \n
1587/// Bits [3:2]: Determine comparison type and aggregation method. \n
1588/// 00: Subset: Each character in \a B is compared for equality with all
1589/// the characters in \a A. \n
1590/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1591/// basis is greater than or equal for even-indexed elements in \a A,
1592/// and less than or equal for odd-indexed elements in \a A. \n
1593/// 10: Match: Compare each pair of corresponding characters in \a A and
1594/// \a B for equality. \n
1595/// 11: Substring: Search \a B for substring matches of \a A. \n
1596/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1597/// mask of the comparison results. \n
1598/// 00: No effect. \n
1599/// 01: Negate the bit mask. \n
1600/// 10: No effect. \n
1601/// 11: Negate the bit mask only for bits with an index less than or equal
1602/// to the size of \a A or \a B. \n
1603/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1604/// bytes. \n
1605/// 0: The result is zero-extended to 16 bytes. \n
1606/// 1: The result is expanded to 16 bytes (this expansion is performed by
1607/// repeating each bit 8 or 16 times).
1608/// \returns Returns a 128-bit integer vector representing the result mask of
1609/// the comparison.
/* A and B are handed to the builtin as __v16qi whatever element format M
   selects; M is cast to int. The builtin's result is cast to __m128i. */
1610#define _mm_cmpistrm(A, B, M) \
1611 ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
1612 (__v16qi)(__m128i)(B), (int)(M)))
1613
1614/// Uses the immediate operand \a M to perform a comparison of string
1615/// data with implicitly defined lengths that is contained in source operands
1616/// \a A and \a B. Returns an integer representing the result index of the
1617/// comparison.
1618///
1619/// \headerfile <x86intrin.h>
1620///
1621/// \code
1622/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
1623/// \endcode
1624///
1625/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1626/// instruction.
1627///
1628/// \param A
1629/// A 128-bit integer vector containing one of the source operands to be
1630/// compared.
1631/// \param B
1632/// A 128-bit integer vector containing one of the source operands to be
1633/// compared.
1634/// \param M
1635/// An 8-bit immediate operand specifying whether the characters are bytes or
1636/// words, the type of comparison to perform, and the format of the return
1637/// value. \n
1638/// Bits [1:0]: Determine source data format. \n
1639/// 00: 16 unsigned bytes \n
1640/// 01: 8 unsigned words \n
1641/// 10: 16 signed bytes \n
1642/// 11: 8 signed words \n
1643/// Bits [3:2]: Determine comparison type and aggregation method. \n
1644/// 00: Subset: Each character in \a B is compared for equality with all
1645/// the characters in \a A. \n
1646/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1647/// basis is greater than or equal for even-indexed elements in \a A,
1648/// and less than or equal for odd-indexed elements in \a A. \n
1649/// 10: Match: Compare each pair of corresponding characters in \a A and
1650/// \a B for equality. \n
1651/// 11: Substring: Search \a B for substring matches of \a A. \n
1652/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1653/// mask of the comparison results. \n
1654/// 00: No effect. \n
1655/// 01: Negate the bit mask. \n
1656/// 10: No effect. \n
1657/// 11: Negate the bit mask only for bits with an index less than or equal
1658/// to the size of \a A or \a B. \n
1659/// Bit [6]: Determines whether the index of the lowest set bit or the
1660/// highest set bit is returned. \n
1661/// 0: The index of the least significant set bit. \n
1662/// 1: The index of the most significant set bit. \n
1663/// \returns Returns an integer representing the result index of the comparison.
/* A and B are handed to the builtin as __v16qi whatever element format M
   selects; M is cast to int. The builtin's result is cast to int. */
1664#define _mm_cmpistri(A, B, M) \
1665 ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
1666 (__v16qi)(__m128i)(B), (int)(M)))
1667
1668/// Uses the immediate operand \a M to perform a comparison of string
1669/// data with explicitly defined lengths that is contained in source operands
1670/// \a A and \a B. Returns a 128-bit integer vector representing the result
1671/// mask of the comparison.
1672///
1673/// \headerfile <x86intrin.h>
1674///
1675/// \code
1676/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
1677/// \endcode
1678///
1679/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
1680/// instruction.
1681///
1682/// \param A
1683/// A 128-bit integer vector containing one of the source operands to be
1684/// compared.
1685/// \param LA
1686/// An integer that specifies the length of the string in \a A.
1687/// \param B
1688/// A 128-bit integer vector containing one of the source operands to be
1689/// compared.
1690/// \param LB
1691/// An integer that specifies the length of the string in \a B.
1692/// \param M
1693/// An 8-bit immediate operand specifying whether the characters are bytes or
1694/// words, the type of comparison to perform, and the format of the return
1695/// value. \n
1696/// Bits [1:0]: Determine source data format. \n
1697/// 00: 16 unsigned bytes \n
1698/// 01: 8 unsigned words \n
1699/// 10: 16 signed bytes \n
1700/// 11: 8 signed words \n
1701/// Bits [3:2]: Determine comparison type and aggregation method. \n
1702/// 00: Subset: Each character in \a B is compared for equality with all
1703/// the characters in \a A. \n
1704/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1705/// basis is greater than or equal for even-indexed elements in \a A,
1706/// and less than or equal for odd-indexed elements in \a A. \n
1707/// 10: Match: Compare each pair of corresponding characters in \a A and
1708/// \a B for equality. \n
1709/// 11: Substring: Search \a B for substring matches of \a A. \n
1710/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1711/// mask of the comparison results. \n
1712/// 00: No effect. \n
1713/// 01: Negate the bit mask. \n
1714/// 10: No effect. \n
1715/// 11: Negate the bit mask only for bits with an index less than or equal
1716/// to the size of \a A or \a B. \n
1717/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1718/// bytes. \n
1719/// 0: The result is zero-extended to 16 bytes. \n
1720/// 1: The result is expanded to 16 bytes (this expansion is performed by
1721/// repeating each bit 8 or 16 times). \n
1722/// \returns Returns a 128-bit integer vector representing the result mask of
1723/// the comparison.
/* A and B are handed to the builtin as __v16qi whatever element format M
   selects; the explicit lengths LA and LB and the control M are each cast to
   int. The builtin's result is cast to __m128i. */
1724#define _mm_cmpestrm(A, LA, B, LB, M) \
1725 ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
1726 (__v16qi)(__m128i)(B), (int)(LB), \
1727 (int)(M)))
1728
1729/// Uses the immediate operand \a M to perform a comparison of string
1730/// data with explicitly defined lengths that is contained in source operands
1731/// \a A and \a B. Returns an integer representing the result index of the
1732/// comparison.
1733///
1734/// \headerfile <x86intrin.h>
1735///
1736/// \code
1737/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
1738/// \endcode
1739///
1740/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
1741/// instruction.
1742///
1743/// \param A
1744/// A 128-bit integer vector containing one of the source operands to be
1745/// compared.
1746/// \param LA
1747/// An integer that specifies the length of the string in \a A.
1748/// \param B
1749/// A 128-bit integer vector containing one of the source operands to be
1750/// compared.
1751/// \param LB
1752/// An integer that specifies the length of the string in \a B.
1753/// \param M
1754/// An 8-bit immediate operand specifying whether the characters are bytes or
1755/// words, the type of comparison to perform, and the format of the return
1756/// value. \n
1757/// Bits [1:0]: Determine source data format. \n
1758/// 00: 16 unsigned bytes \n
1759/// 01: 8 unsigned words \n
1760/// 10: 16 signed bytes \n
1761/// 11: 8 signed words \n
1762/// Bits [3:2]: Determine comparison type and aggregation method. \n
1763/// 00: Subset: Each character in \a B is compared for equality with all
1764/// the characters in \a A. \n
1765/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1766/// basis is greater than or equal for even-indexed elements in \a A,
1767/// and less than or equal for odd-indexed elements in \a A. \n
1768/// 10: Match: Compare each pair of corresponding characters in \a A and
1769/// \a B for equality. \n
1770/// 11: Substring: Search \a B for substring matches of \a A. \n
1771/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1772/// mask of the comparison results. \n
1773/// 00: No effect. \n
1774/// 01: Negate the bit mask. \n
1775/// 10: No effect. \n
1776/// 11: Negate the bit mask only for bits with an index less than or equal
1777/// to the size of \a A or \a B. \n
1778/// Bit [6]: Determines whether the index of the lowest set bit or the
1779/// highest set bit is returned. \n
1780/// 0: The index of the least significant set bit. \n
1781/// 1: The index of the most significant set bit. \n
1782/// \returns Returns an integer representing the result index of the comparison.
/* A and B are handed to the builtin as __v16qi whatever element format M
   selects; the explicit lengths LA and LB and the control M are each cast to
   int. The builtin's result is cast to int. */
1783#define _mm_cmpestri(A, LA, B, LB, M) \
1784 ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
1785 (__v16qi)(__m128i)(B), (int)(LB), \
1786 (int)(M)))
1787
1788/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
1789/// Uses the immediate operand \a M to perform a comparison of string
1790/// data with implicitly defined lengths that is contained in source operands
1791/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
1792/// string in \a B is the maximum, otherwise, returns 0.
1793///
1794/// \headerfile <x86intrin.h>
1795///
1796/// \code
1797/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
1798/// \endcode
1799///
1800/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1801/// instruction.
1802///
1803/// \param A
1804/// A 128-bit integer vector containing one of the source operands to be
1805/// compared.
1806/// \param B
1807/// A 128-bit integer vector containing one of the source operands to be
1808/// compared.
1809/// \param M
1810/// An 8-bit immediate operand specifying whether the characters are bytes or
1811/// words and the type of comparison to perform. \n
1812/// Bits [1:0]: Determine source data format. \n
1813/// 00: 16 unsigned bytes \n
1814/// 01: 8 unsigned words \n
1815/// 10: 16 signed bytes \n
1816/// 11: 8 signed words \n
1817/// Bits [3:2]: Determine comparison type and aggregation method. \n
1818/// 00: Subset: Each character in \a B is compared for equality with all
1819/// the characters in \a A. \n
1820/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1821/// basis is greater than or equal for even-indexed elements in \a A,
1822/// and less than or equal for odd-indexed elements in \a A. \n
1823/// 10: Match: Compare each pair of corresponding characters in \a A and
1824/// \a B for equality. \n
1825/// 11: Substring: Search \a B for substring matches of \a A. \n
1826/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1827/// mask of the comparison results. \n
1828/// 00: No effect. \n
1829/// 01: Negate the bit mask. \n
1830/// 10: No effect. \n
1831/// 11: Negate the bit mask only for bits with an index less than or equal
1832/// to the size of \a A or \a B. \n
1833/// \returns Returns 1 if the bit mask is zero and the length of the string in
1834/// \a B is the maximum; otherwise, returns 0.
/* Expands to the dedicated pcmpistria128 builtin. A and B are passed as
   __v16qi, M is cast to int, and the builtin's result is cast to int. */
1835#define _mm_cmpistra(A, B, M) \
1836 ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
1837 (__v16qi)(__m128i)(B), (int)(M)))
1838
1839/// Uses the immediate operand \a M to perform a comparison of string
1840/// data with implicitly defined lengths that is contained in source operands
1841/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
1842/// 0.
1843///
1844/// \headerfile <x86intrin.h>
1845///
1846/// \code
1847/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
1848/// \endcode
1849///
1850/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1851/// instruction.
1852///
1853/// \param A
1854/// A 128-bit integer vector containing one of the source operands to be
1855/// compared.
1856/// \param B
1857/// A 128-bit integer vector containing one of the source operands to be
1858/// compared.
1859/// \param M
1860/// An 8-bit immediate operand specifying whether the characters are bytes or
1861/// words and the type of comparison to perform. \n
1862/// Bits [1:0]: Determine source data format. \n
1863/// 00: 16 unsigned bytes \n
1864/// 01: 8 unsigned words \n
1865/// 10: 16 signed bytes \n
1866/// 11: 8 signed words \n
1867/// Bits [3:2]: Determine comparison type and aggregation method. \n
1868/// 00: Subset: Each character in \a B is compared for equality with all
1869/// the characters in \a A. \n
1870/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1871/// basis is greater than or equal for even-indexed elements in \a A,
1872/// and less than or equal for odd-indexed elements in \a A. \n
1873/// 10: Match: Compare each pair of corresponding characters in \a A and
1874/// \a B for equality. \n
1875/// 11: Substring: Search \a B for substring matches of \a A. \n
1876/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1877/// mask of the comparison results. \n
1878/// 00: No effect. \n
1879/// 01: Negate the bit mask. \n
1880/// 10: No effect. \n
1881/// 11: Negate the bit mask only for bits with an index less than or equal
1882/// to the size of \a A or \a B.
1883/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
/* Expands to the dedicated pcmpistric128 builtin. A and B are passed as
   __v16qi, M is cast to int, and the builtin's result is cast to int. */
1884#define _mm_cmpistrc(A, B, M) \
1885 ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
1886 (__v16qi)(__m128i)(B), (int)(M)))
1887
1888/// Uses the immediate operand \a M to perform a comparison of string
1889/// data with implicitly defined lengths that is contained in source operands
1890/// \a A and \a B. Returns bit 0 of the resulting bit mask.
1891///
1892/// \headerfile <x86intrin.h>
1893///
1894/// \code
1895/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
1896/// \endcode
1897///
1898/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1899/// instruction.
1900///
1901/// \param A
1902/// A 128-bit integer vector containing one of the source operands to be
1903/// compared.
1904/// \param B
1905/// A 128-bit integer vector containing one of the source operands to be
1906/// compared.
1907/// \param M
1908/// An 8-bit immediate operand specifying whether the characters are bytes or
1909/// words and the type of comparison to perform. \n
1910/// Bits [1:0]: Determine source data format. \n
1911/// 00: 16 unsigned bytes \n
1912/// 01: 8 unsigned words \n
1913/// 10: 16 signed bytes \n
1914/// 11: 8 signed words \n
1915/// Bits [3:2]: Determine comparison type and aggregation method. \n
1916/// 00: Subset: Each character in \a B is compared for equality with all
1917/// the characters in \a A. \n
1918/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1919/// basis is greater than or equal for even-indexed elements in \a A,
1920/// and less than or equal for odd-indexed elements in \a A. \n
1921/// 10: Match: Compare each pair of corresponding characters in \a A and
1922/// \a B for equality. \n
1923/// 11: Substring: Search \a B for substring matches of \a A. \n
1924/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1925/// mask of the comparison results. \n
1926/// 00: No effect. \n
1927/// 01: Negate the bit mask. \n
1928/// 10: No effect. \n
1929/// 11: Negate the bit mask only for bits with an index less than or equal
1930/// to the size of \a A or \a B. \n
1931/// \returns Returns bit 0 of the resulting bit mask.
/* Expands to the dedicated pcmpistrio128 builtin. A and B are passed as
   __v16qi, M is cast to int, and the builtin's result is cast to int. */
1932#define _mm_cmpistro(A, B, M) \
1933 ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
1934 (__v16qi)(__m128i)(B), (int)(M)))
1935
1936/// Uses the immediate operand \a M to perform a comparison of string
1937/// data with implicitly defined lengths that is contained in source operands
1938/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
1939/// the maximum, otherwise, returns 0.
1940///
1941/// \headerfile <x86intrin.h>
1942///
1943/// \code
1944/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
1945/// \endcode
1946///
1947/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1948/// instruction.
1949///
1950/// \param A
1951/// A 128-bit integer vector containing one of the source operands to be
1952/// compared.
1953/// \param B
1954/// A 128-bit integer vector containing one of the source operands to be
1955/// compared.
1956/// \param M
1957/// An 8-bit immediate operand specifying whether the characters are bytes or
1958/// words and the type of comparison to perform. \n
1959/// Bits [1:0]: Determine source data format. \n
1960/// 00: 16 unsigned bytes \n
1961/// 01: 8 unsigned words \n
1962/// 10: 16 signed bytes \n
1963/// 11: 8 signed words \n
1964/// Bits [3:2]: Determine comparison type and aggregation method. \n
1965/// 00: Subset: Each character in \a B is compared for equality with all
1966/// the characters in \a A. \n
1967/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1968/// basis is greater than or equal for even-indexed elements in \a A,
1969/// and less than or equal for odd-indexed elements in \a A. \n
1970/// 10: Match: Compare each pair of corresponding characters in \a A and
1971/// \a B for equality. \n
1972/// 11: Substring: Search \a B for substring matches of \a A. \n
1973/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1974/// mask of the comparison results. \n
1975/// 00: No effect. \n
1976/// 01: Negate the bit mask. \n
1977/// 10: No effect. \n
1978/// 11: Negate the bit mask only for bits with an index less than or equal
1979/// to the size of \a A or \a B. \n
1980/// \returns Returns 1 if the length of the string in \a A is less than the
1981/// maximum, otherwise, returns 0.
/* Expands to the dedicated pcmpistris128 builtin. A and B are passed as
   __v16qi, M is cast to int, and the builtin's result is cast to int. */
1982#define _mm_cmpistrs(A, B, M) \
1983 ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
1984 (__v16qi)(__m128i)(B), (int)(M)))
1985
1986/// Uses the immediate operand \a M to perform a comparison of string
1987/// data with implicitly defined lengths that is contained in source operands
1988/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
1989/// the maximum, otherwise, returns 0.
1990///
1991/// \headerfile <x86intrin.h>
1992///
1993/// \code
1994/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
1995/// \endcode
1996///
1997/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1998/// instruction.
1999///
2000/// \param A
2001/// A 128-bit integer vector containing one of the source operands to be
2002/// compared.
2003/// \param B
2004/// A 128-bit integer vector containing one of the source operands to be
2005/// compared.
2006/// \param M
2007/// An 8-bit immediate operand specifying whether the characters are bytes or
2008/// words and the type of comparison to perform. \n
2009/// Bits [1:0]: Determine source data format. \n
2010/// 00: 16 unsigned bytes \n
2011/// 01: 8 unsigned words \n
2012/// 10: 16 signed bytes \n
2013/// 11: 8 signed words \n
2014/// Bits [3:2]: Determine comparison type and aggregation method. \n
2015/// 00: Subset: Each character in \a B is compared for equality with all
2016/// the characters in \a A. \n
2017/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2018/// basis is greater than or equal for even-indexed elements in \a A,
2019/// and less than or equal for odd-indexed elements in \a A. \n
2020/// 10: Match: Compare each pair of corresponding characters in \a A and
2021/// \a B for equality. \n
2022/// 11: Substring: Search \a B for substring matches of \a A. \n
2023/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2024/// mask of the comparison results. \n
2025/// 00: No effect. \n
2026/// 01: Negate the bit mask. \n
2027/// 10: No effect. \n
2028/// 11: Negate the bit mask only for bits with an index less than or equal
2029/// to the size of \a A or \a B.
2030/// \returns Returns 1 if the length of the string in \a B is less than the
2031/// maximum, otherwise, returns 0.
/* Expands to the dedicated pcmpistriz128 builtin. A and B are passed as
   __v16qi, M is cast to int, and the builtin's result is cast to int. */
2032#define _mm_cmpistrz(A, B, M) \
2033 ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
2034 (__v16qi)(__m128i)(B), (int)(M)))
2035
2036/// Uses the immediate operand \a M to perform a comparison of string
2037/// data with explicitly defined lengths that is contained in source operands
2038/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
2039/// string in \a B is the maximum, otherwise, returns 0.
2040///
2041/// \headerfile <x86intrin.h>
2042///
2043/// \code
2044/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
2045/// \endcode
2046///
2047/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2048/// instruction.
2049///
2050/// \param A
2051/// A 128-bit integer vector containing one of the source operands to be
2052/// compared.
2053/// \param LA
2054/// An integer that specifies the length of the string in \a A.
2055/// \param B
2056/// A 128-bit integer vector containing one of the source operands to be
2057/// compared.
2058/// \param LB
2059/// An integer that specifies the length of the string in \a B.
2060/// \param M
2061/// An 8-bit immediate operand specifying whether the characters are bytes or
2062/// words and the type of comparison to perform. \n
2063/// Bits [1:0]: Determine source data format. \n
2064/// 00: 16 unsigned bytes \n
2065/// 01: 8 unsigned words \n
2066/// 10: 16 signed bytes \n
2067/// 11: 8 signed words \n
2068/// Bits [3:2]: Determine comparison type and aggregation method. \n
2069/// 00: Subset: Each character in \a B is compared for equality with all
2070/// the characters in \a A. \n
2071/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2072/// basis is greater than or equal for even-indexed elements in \a A,
2073/// and less than or equal for odd-indexed elements in \a A. \n
2074/// 10: Match: Compare each pair of corresponding characters in \a A and
2075/// \a B for equality. \n
2076/// 11: Substring: Search \a B for substring matches of \a A. \n
2077/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2078/// mask of the comparison results. \n
2079/// 00: No effect. \n
2080/// 01: Negate the bit mask. \n
2081/// 10: No effect. \n
2082/// 11: Negate the bit mask only for bits with an index less than or equal
2083/// to the size of \a A or \a B.
2084/// \returns Returns 1 if the bit mask is zero and the length of the string in
2085/// \a B is the maximum, otherwise, returns 0.
/* Expands to the dedicated pcmpestria128 builtin. A and B are passed as
   __v16qi; LA, LB and M are each cast to int, and the builtin's result is
   cast to int. */
2086#define _mm_cmpestra(A, LA, B, LB, M) \
2087 ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
2088 (__v16qi)(__m128i)(B), (int)(LB), \
2089 (int)(M)))
2090
2091/// Uses the immediate operand \a M to perform a comparison of string
2092/// data with explicitly defined lengths that is contained in source operands
2093/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
2094/// returns 0.
2095///
2096/// \headerfile <x86intrin.h>
2097///
2098/// \code
2099/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
2100/// \endcode
2101///
2102/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2103/// instruction.
2104///
2105/// \param A
2106/// A 128-bit integer vector containing one of the source operands to be
2107/// compared.
2108/// \param LA
2109/// An integer that specifies the length of the string in \a A.
2110/// \param B
2111/// A 128-bit integer vector containing one of the source operands to be
2112/// compared.
2113/// \param LB
2114/// An integer that specifies the length of the string in \a B.
2115/// \param M
2116/// An 8-bit immediate operand specifying whether the characters are bytes or
2117/// words and the type of comparison to perform. \n
2118/// Bits [1:0]: Determine source data format. \n
2119/// 00: 16 unsigned bytes \n
2120/// 01: 8 unsigned words \n
2121/// 10: 16 signed bytes \n
2122/// 11: 8 signed words \n
2123/// Bits [3:2]: Determine comparison type and aggregation method. \n
2124/// 00: Subset: Each character in \a B is compared for equality with all
2125/// the characters in \a A. \n
2126/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2127/// basis is greater than or equal for even-indexed elements in \a A,
2128/// and less than or equal for odd-indexed elements in \a A. \n
2129/// 10: Match: Compare each pair of corresponding characters in \a A and
2130/// \a B for equality. \n
2131/// 11: Substring: Search \a B for substring matches of \a A. \n
2132/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2133/// mask of the comparison results. \n
2134/// 00: No effect. \n
2135/// 01: Negate the bit mask. \n
2136/// 10: No effect. \n
2137/// 11: Negate the bit mask only for bits with an index less than or equal
2138/// to the size of \a A or \a B. \n
2139/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
/* Expands to the dedicated pcmpestric128 builtin. A and B are passed as
   __v16qi; LA, LB and M are each cast to int, and the builtin's result is
   cast to int. */
2140#define _mm_cmpestrc(A, LA, B, LB, M) \
2141 ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
2142 (__v16qi)(__m128i)(B), (int)(LB), \
2143 (int)(M)))
2144
2145/// Uses the immediate operand \a M to perform a comparison of string
2146/// data with explicitly defined lengths that is contained in source operands
2147/// \a A and \a B. Returns bit 0 of the resulting bit mask.
2148///
2149/// \headerfile <x86intrin.h>
2150///
2151/// \code
2152/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
2153/// \endcode
2154///
2155/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2156/// instruction.
2157///
2158/// \param A
2159/// A 128-bit integer vector containing one of the source operands to be
2160/// compared.
2161/// \param LA
2162/// An integer that specifies the length of the string in \a A.
2163/// \param B
2164/// A 128-bit integer vector containing one of the source operands to be
2165/// compared.
2166/// \param LB
2167/// An integer that specifies the length of the string in \a B.
2168/// \param M
2169/// An 8-bit immediate operand specifying whether the characters are bytes or
2170/// words and the type of comparison to perform. \n
2171/// Bits [1:0]: Determine source data format. \n
2172/// 00: 16 unsigned bytes \n
2173/// 01: 8 unsigned words \n
2174/// 10: 16 signed bytes \n
2175/// 11: 8 signed words \n
2176/// Bits [3:2]: Determine comparison type and aggregation method. \n
2177/// 00: Subset: Each character in \a B is compared for equality with all
2178/// the characters in \a A. \n
2179/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2180/// basis is greater than or equal for even-indexed elements in \a A,
2181/// and less than or equal for odd-indexed elements in \a A. \n
2182/// 10: Match: Compare each pair of corresponding characters in \a A and
2183/// \a B for equality. \n
2184/// 11: Substring: Search \a B for substring matches of \a A. \n
2185/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2186/// mask of the comparison results. \n
2187/// 00: No effect. \n
2188/// 01: Negate the bit mask. \n
2189/// 10: No effect. \n
2190/// 11: Negate the bit mask only for bits with an index less than or equal
2191/// to the size of \a A or \a B.
2192/// \returns Returns bit 0 of the resulting bit mask.
/* Expands to the dedicated pcmpestrio128 builtin. A and B are passed as
   __v16qi; LA, LB and M are each cast to int, and the builtin's result is
   cast to int. */
2193#define _mm_cmpestro(A, LA, B, LB, M) \
2194 ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
2195 (__v16qi)(__m128i)(B), (int)(LB), \
2196 (int)(M)))
2197
2198/// Uses the immediate operand \a M to perform a comparison of string
2199/// data with explicitly defined lengths that is contained in source operands
2200/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
2201/// the maximum, otherwise, returns 0.
2202///
2203/// \headerfile <x86intrin.h>
2204///
2205/// \code
2206/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
2207/// \endcode
2208///
2209/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2210/// instruction.
2211///
2212/// \param A
2213/// A 128-bit integer vector containing one of the source operands to be
2214/// compared.
2215/// \param LA
2216/// An integer that specifies the length of the string in \a A.
2217/// \param B
2218/// A 128-bit integer vector containing one of the source operands to be
2219/// compared.
2220/// \param LB
2221/// An integer that specifies the length of the string in \a B.
2222/// \param M
2223/// An 8-bit immediate operand specifying whether the characters are bytes or
2224/// words and the type of comparison to perform. \n
2225/// Bits [1:0]: Determine source data format. \n
2226/// 00: 16 unsigned bytes \n
2227/// 01: 8 unsigned words \n
2228/// 10: 16 signed bytes \n
2229/// 11: 8 signed words \n
2230/// Bits [3:2]: Determine comparison type and aggregation method. \n
2231/// 00: Subset: Each character in \a B is compared for equality with all
2232/// the characters in \a A. \n
2233/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2234/// basis is greater than or equal for even-indexed elements in \a A,
2235/// and less than or equal for odd-indexed elements in \a A. \n
2236/// 10: Match: Compare each pair of corresponding characters in \a A and
2237/// \a B for equality. \n
2238/// 11: Substring: Search \a B for substring matches of \a A. \n
2239/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2240/// mask of the comparison results. \n
2241/// 00: No effect. \n
2242/// 01: Negate the bit mask. \n
2243/// 10: No effect. \n
2244/// 11: Negate the bit mask only for bits with an index less than or equal
2245/// to the size of \a A or \a B.
2246/// \returns Returns 1 if the length of the string in \a A is less than the
2247/// maximum, otherwise, returns 0.
2248#define _mm_cmpestrs(A, LA, B, LB, M) \
2249 ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
2250 (__v16qi)(__m128i)(B), (int)(LB), \
2251 (int)(M)))
2252
2253/// Uses the immediate operand \a M to perform a comparison of string
2254/// data with explicitly defined lengths that is contained in source operands
2255/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
2256/// the maximum, otherwise, returns 0.
2257///
2258/// \headerfile <x86intrin.h>
2259///
2260/// \code
2261/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
2262/// \endcode
2263///
2264/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> instruction.
2265///
2266/// \param A
2267/// A 128-bit integer vector containing one of the source operands to be
2268/// compared.
2269/// \param LA
2270/// An integer that specifies the length of the string in \a A.
2271/// \param B
2272/// A 128-bit integer vector containing one of the source operands to be
2273/// compared.
2274/// \param LB
2275/// An integer that specifies the length of the string in \a B.
2276/// \param M
2277/// An 8-bit immediate operand specifying whether the characters are bytes or
2278/// words and the type of comparison to perform. \n
2279/// Bits [1:0]: Determine source data format. \n
2280/// 00: 16 unsigned bytes \n
2281/// 01: 8 unsigned words \n
2282/// 10: 16 signed bytes \n
2283/// 11: 8 signed words \n
2284/// Bits [3:2]: Determine comparison type and aggregation method. \n
2285/// 00: Subset: Each character in \a B is compared for equality with all
2286/// the characters in \a A. \n
2287/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2288/// basis is greater than or equal for even-indexed elements in \a A,
2289/// and less than or equal for odd-indexed elements in \a A. \n
2290/// 10: Match: Compare each pair of corresponding characters in \a A and
2291/// \a B for equality. \n
2292/// 11: Substring: Search \a B for substring matches of \a A. \n
2293/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2294/// mask of the comparison results. \n
2295/// 00: No effect. \n
2296/// 01: Negate the bit mask. \n
2297/// 10: No effect. \n
2298/// 11: Negate the bit mask only for bits with an index less than or equal
2299/// to the size of \a A or \a B.
2300/// \returns Returns 1 if the length of the string in \a B is less than the
2301/// maximum, otherwise, returns 0.
2302#define _mm_cmpestrz(A, LA, B, LB, M) \
2303 ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
2304 (__v16qi)(__m128i)(B), (int)(LB), \
2305 (int)(M)))
2306
2307/* SSE4.2 Compare Packed Data -- Greater Than. */
2308/// Compares each of the corresponding 64-bit values of the 128-bit
2309/// integer vectors to determine if the values in the first operand are
2310/// greater than those in the second operand.
2311///
2312/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
2313///
2314/// \headerfile <x86intrin.h>
2315///
2316/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
2317///
2318/// \param __V1
2319/// A 128-bit integer vector; the left-hand operand of each comparison.
2320/// \param __V2
2321/// A 128-bit integer vector; the right-hand operand of each comparison.
2322/// \returns A 128-bit integer vector containing the two 64-bit comparison results.
2323static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
2324 __m128i __V2) {
2325 return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2326}
2327
2328#undef __DEFAULT_FN_ATTRS
2329
2330#include <popcntintrin.h>
2331
2332#include <crc32intrin.h>
2333
2334#endif /* __SMMINTRIN_H */
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1, __m128d __V2, __m128d __M)
Returns a 128-bit vector of [2 x double] where the values are selected from either the first or secon...
Definition: smmintrin.h:442
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [8 x u16] and returns a 128-bit vector ...
Definition: smmintrin.h:708
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V)
Zero-extends each of the lower four 8-bit integer elements of a 128-bit vector of [16 x i8] to 32-bit...
Definition: smmintrin.h:1364
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V)
Zero-extends each of the lower eight 8-bit integer elements of a 128-bit vector of [16 x i8] to 16-bi...
Definition: smmintrin.h:1344
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1, __m128i __V2)
Multiplies corresponding elements of two 128-bit vectors of [4 x i32] and returns the lower 32 bits of...
Definition: smmintrin.h:545
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_stream_load_si128(const void *__V)
Loads integer values from a 128-bit aligned memory location to a 128-bit integer vector.
Definition: smmintrin.h:654
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [16 x i8] and returns a 128-bit vector ...
Definition: smmintrin.h:672
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x u32] and returns a 128-bit vector ...
Definition: smmintrin.h:798
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V)
Sign-extends each of the lower four 8-bit integer elements of a 128-bit vector of [16 x i8] to 32-bit...
Definition: smmintrin.h:1249
#define __DEFAULT_FN_ATTRS
Definition: smmintrin.h:1528
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1, __m128 __V2, __m128 __M)
Returns a 128-bit vector of [4 x float] where the values are selected from either the first or second...
Definition: smmintrin.h:469
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V)
Zero-extends each of the lower two 32-bit integer elements of a 128-bit integer vector of [4 x i32] t...
Definition: smmintrin.h:1436
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V)
Zero-extends each of the lower two 16-bit integer elements of a 128-bit integer vector of [8 x i16] t...
Definition: smmintrin.h:1418
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
Compares each of the corresponding 64-bit values of the 128-bit integer vectors to determine if the v...
Definition: smmintrin.h:2323
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, __m128i __V2)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-b...
Definition: smmintrin.h:1460
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x i32] and returns a 128-bit vector ...
Definition: smmintrin.h:744
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V)
Sign-extends each of the lower two 16-bit integer elements of a 128-bit integer vector of [8 x i16] t...
Definition: smmintrin.h:1307
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V)
Sign-extends each of the lower two 8-bit integer elements of a 128-bit integer vector of [16 x i8] to...
Definition: smmintrin.h:1269
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [16 x i8] and returns a 128-bit vector ...
Definition: smmintrin.h:690
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [8 x u16] and returns a 128-bit vector ...
Definition: smmintrin.h:726
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
Compares each of the corresponding 64-bit values of the 128-bit integer vectors for equality.
Definition: smmintrin.h:1208
static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are neither all zeros nor all ones.
Definition: smmintrin.h:1134
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V)
Sign-extends each of the lower four 16-bit integer elements of a 128-bit integer vector of [8 x i16] ...
Definition: smmintrin.h:1289
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V)
Zero-extends each of the lower four 16-bit integer elements of a 128-bit integer vector of [8 x i16] ...
Definition: smmintrin.h:1400
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x i32] and returns a 128-bit vector ...
Definition: smmintrin.h:762
static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are all ones.
Definition: smmintrin.h:1116
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V)
Zero-extends each of the lower two 8-bit integer elements of a 128-bit integer vector of [16 x i8] to...
Definition: smmintrin.h:1382
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x u32] and returns a 128-bit vector ...
Definition: smmintrin.h:780
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V)
Sign-extends each of the lower eight 8-bit integer elements of a 128-bit vector of [16 x i8] to 16-bi...
Definition: smmintrin.h:1227
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1, __m128i __V2)
Multiplies corresponding even-indexed elements of two 128-bit vectors of [4 x i32] and returns a 128-...
Definition: smmintrin.h:564
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V)
Finds the minimum unsigned 16-bit element in the input 128-bit vector of [8 x u16] and returns it and...
Definition: smmintrin.h:1518
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V)
Sign-extends each of the lower two 32-bit integer elements of a 128-bit integer vector of [4 x i32] t...
Definition: smmintrin.h:1325
static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are all zeros.
Definition: smmintrin.h:1099
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1, __m128i __V2, __m128i __M)
Returns a 128-bit vector of [16 x i8] where the values are selected from either of the first or secon...
Definition: smmintrin.h:496