clang 20.0.0git
xmmintrin.h
Go to the documentation of this file.
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __XMMINTRIN_H
11#define __XMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <mmintrin.h>
18
/* Internal 16-byte vector views used for casts inside this header; the
 * element type differs, the total size (16 bytes) does not. */
19typedef int __v4si __attribute__((__vector_size__(16)));
20typedef float __v4sf __attribute__((__vector_size__(16)));
/* 16-byte float vector with an explicit 16-byte alignment requirement; this
 * is the parameter and return type of the intrinsics defined below. */
21typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
22
/* Same 16-byte float vector, but declared with an alignment of 1. */
23typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
24
25/* Unsigned types */
/* Unsigned-int view; the bitwise intrinsics cast their float operands to it. */
26typedef unsigned int __v4su __attribute__((__vector_size__(16)));
27
28/* This header should only be included in a hosted environment as it depends on
29 * a standard library to provide allocation routines. */
30#if __STDC_HOSTED__
31#include <mm_malloc.h>
32#endif
33
34/* Define the default attributes for the functions in this file. */
/* Both macros request always-inline, no debug info and a minimum vector width
 * of 128 bits. They differ only in the __target__ feature string: "sse" for
 * __DEFAULT_FN_ATTRS and "sse2" for __DEFAULT_FN_ATTRS_SSE2. When __EVEX512__
 * is defined and __AVX10_1_512__ is not, "no-evex512" is appended to the
 * feature string of both macros. */
35#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
36#define __DEFAULT_FN_ATTRS \
37 __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
38 __min_vector_width__(128)))
39#define __DEFAULT_FN_ATTRS_SSE2 \
40 __attribute__((__always_inline__, __nodebug__, \
41 __target__("sse2,no-evex512"), __min_vector_width__(128)))
42#else
43#define __DEFAULT_FN_ATTRS \
44 __attribute__((__always_inline__, __nodebug__, __target__("sse"), \
45 __min_vector_width__(128)))
46#define __DEFAULT_FN_ATTRS_SSE2 \
47 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
48 __min_vector_width__(128)))
49#endif
50
/* Vector reshaping helpers built on __builtin_shufflevector. Every expansion
 * is wrapped in an outer pair of parentheses so that the cast result forms a
 * single primary expression regardless of the operators that surround the
 * macro invocation. */

/* Views (x) as __v2di, keeps element 0 and returns it as an __m64. */
#define __trunc64(x)                                                           \
  ((__m64)__builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0))

/* Views (x) as __v2si and widens it to an __m128i: elements 0-1 come from
 * (x), elements 2-3 come from an empty-initialized (all-zero) vector. */
#define __zext128(x)                                                           \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, 2, 3))

/* Views (x) as __v2si and widens it to an __m128i: elements 0-1 come from
 * (x); the -1 indices leave elements 2-3 unspecified. */
#define __anyext128(x)                                                         \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, -1, -1))

/* Views (x) as __v4si, keeps elements 0-1 and replaces elements 2-3 with
 * elements of an empty-initialized (all-zero) vector. */
#define __zeroupper64(x)                                                       \
  ((__m128i)__builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0,   \
                                    1, 4, 5))
62
63/// Adds the 32-bit float values in the low-order bits of the operands.
64///
65/// \headerfile <x86intrin.h>
66///
67/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
68///
69/// \param __a
70/// A 128-bit vector of [4 x float] containing one of the source operands.
71/// The lower 32 bits of this operand are used in the calculation.
72/// \param __b
73/// A 128-bit vector of [4 x float] containing one of the source operands.
74/// The lower 32 bits of this operand are used in the calculation.
75/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
76/// of the lower 32 bits of both operands. The upper 96 bits are copied from
77/// the upper 96 bits of the first source operand.
78static __inline__ __m128 __DEFAULT_FN_ATTRS
79_mm_add_ss(__m128 __a, __m128 __b)
80{
81 __a[0] += __b[0];
82 return __a;
83}
84
85/// Adds two 128-bit vectors of [4 x float], and returns the results of
86/// the addition.
87///
88/// \headerfile <x86intrin.h>
89///
90/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
91///
92/// \param __a
93/// A 128-bit vector of [4 x float] containing one of the source operands.
94/// \param __b
95/// A 128-bit vector of [4 x float] containing one of the source operands.
96/// \returns A 128-bit vector of [4 x float] containing the sums of both
97/// operands.
98static __inline__ __m128 __DEFAULT_FN_ATTRS
99_mm_add_ps(__m128 __a, __m128 __b)
100{
101 return (__m128)((__v4sf)__a + (__v4sf)__b);
102}
103
104/// Subtracts the 32-bit float value in the low-order bits of the second
105/// operand from the corresponding value in the first operand.
106///
107/// \headerfile <x86intrin.h>
108///
109/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
110///
111/// \param __a
112/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
113/// of this operand are used in the calculation.
114/// \param __b
115/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
116/// bits of this operand are used in the calculation.
117/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
118/// difference of the lower 32 bits of both operands. The upper 96 bits are
119/// copied from the upper 96 bits of the first source operand.
120static __inline__ __m128 __DEFAULT_FN_ATTRS
121_mm_sub_ss(__m128 __a, __m128 __b)
122{
123 __a[0] -= __b[0];
124 return __a;
125}
126
127/// Subtracts each of the values of the second operand from the first
128/// operand, both of which are 128-bit vectors of [4 x float] and returns
129/// the results of the subtraction.
130///
131/// \headerfile <x86intrin.h>
132///
133/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
134///
135/// \param __a
136/// A 128-bit vector of [4 x float] containing the minuend.
137/// \param __b
138/// A 128-bit vector of [4 x float] containing the subtrahend.
139/// \returns A 128-bit vector of [4 x float] containing the differences between
140/// both operands.
141static __inline__ __m128 __DEFAULT_FN_ATTRS
142_mm_sub_ps(__m128 __a, __m128 __b)
143{
144 return (__m128)((__v4sf)__a - (__v4sf)__b);
145}
146
147/// Multiplies two 32-bit float values in the low-order bits of the
148/// operands.
149///
150/// \headerfile <x86intrin.h>
151///
152/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
153///
154/// \param __a
155/// A 128-bit vector of [4 x float] containing one of the source operands.
156/// The lower 32 bits of this operand are used in the calculation.
157/// \param __b
158/// A 128-bit vector of [4 x float] containing one of the source operands.
159/// The lower 32 bits of this operand are used in the calculation.
160/// \returns A 128-bit vector of [4 x float] containing the product of the lower
161/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
162/// bits of the first source operand.
163static __inline__ __m128 __DEFAULT_FN_ATTRS
164_mm_mul_ss(__m128 __a, __m128 __b)
165{
166 __a[0] *= __b[0];
167 return __a;
168}
169
170/// Multiplies two 128-bit vectors of [4 x float] and returns the
171/// results of the multiplication.
172///
173/// \headerfile <x86intrin.h>
174///
175/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
176///
177/// \param __a
178/// A 128-bit vector of [4 x float] containing one of the source operands.
179/// \param __b
180/// A 128-bit vector of [4 x float] containing one of the source operands.
181/// \returns A 128-bit vector of [4 x float] containing the products of both
182/// operands.
183static __inline__ __m128 __DEFAULT_FN_ATTRS
184_mm_mul_ps(__m128 __a, __m128 __b)
185{
186 return (__m128)((__v4sf)__a * (__v4sf)__b);
187}
188
189/// Divides the value in the low-order 32 bits of the first operand by
190/// the corresponding value in the second operand.
191///
192/// \headerfile <x86intrin.h>
193///
194/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
195///
196/// \param __a
197/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
198/// bits of this operand are used in the calculation.
199/// \param __b
200/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
201/// of this operand are used in the calculation.
202/// \returns A 128-bit vector of [4 x float] containing the quotients of the
203/// lower 32 bits of both operands. The upper 96 bits are copied from the
204/// upper 96 bits of the first source operand.
205static __inline__ __m128 __DEFAULT_FN_ATTRS
206_mm_div_ss(__m128 __a, __m128 __b)
207{
208 __a[0] /= __b[0];
209 return __a;
210}
211
212/// Divides two 128-bit vectors of [4 x float].
213///
214/// \headerfile <x86intrin.h>
215///
216/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
217///
218/// \param __a
219/// A 128-bit vector of [4 x float] containing the dividend.
220/// \param __b
221/// A 128-bit vector of [4 x float] containing the divisor.
222/// \returns A 128-bit vector of [4 x float] containing the quotients of both
223/// operands.
224static __inline__ __m128 __DEFAULT_FN_ATTRS
225_mm_div_ps(__m128 __a, __m128 __b)
226{
227 return (__m128)((__v4sf)__a / (__v4sf)__b);
228}
229
230/// Calculates the square root of the value stored in the low-order bits
231/// of a 128-bit vector of [4 x float].
232///
233/// \headerfile <x86intrin.h>
234///
235/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
236///
237/// \param __a
238/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
239/// used in the calculation.
240/// \returns A 128-bit vector of [4 x float] containing the square root of the
241/// value in the low-order bits of the operand.
242static __inline__ __m128 __DEFAULT_FN_ATTRS
244{
245 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
246}
247
248/// Calculates the square roots of the values stored in a 128-bit vector
249/// of [4 x float].
250///
251/// \headerfile <x86intrin.h>
252///
253/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
254///
255/// \param __a
256/// A 128-bit vector of [4 x float].
257/// \returns A 128-bit vector of [4 x float] containing the square roots of the
258/// values in the operand.
259static __inline__ __m128 __DEFAULT_FN_ATTRS
261{
262 return __builtin_ia32_sqrtps((__v4sf)__a);
263}
264
265/// Calculates the approximate reciprocal of the value stored in the
266/// low-order bits of a 128-bit vector of [4 x float].
267///
268/// \headerfile <x86intrin.h>
269///
270/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
271///
272/// \param __a
273/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
274/// used in the calculation.
275/// \returns A 128-bit vector of [4 x float] containing the approximate
276/// reciprocal of the value in the low-order bits of the operand.
277static __inline__ __m128 __DEFAULT_FN_ATTRS
279{
280 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
281}
282
283/// Calculates the approximate reciprocals of the values stored in a
284/// 128-bit vector of [4 x float].
285///
286/// \headerfile <x86intrin.h>
287///
288/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
289///
290/// \param __a
291/// A 128-bit vector of [4 x float].
292/// \returns A 128-bit vector of [4 x float] containing the approximate
293/// reciprocals of the values in the operand.
294static __inline__ __m128 __DEFAULT_FN_ATTRS
296{
297 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
298}
299
300/// Calculates the approximate reciprocal of the square root of the value
301/// stored in the low-order bits of a 128-bit vector of [4 x float].
302///
303/// \headerfile <x86intrin.h>
304///
305/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
306///
307/// \param __a
308/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
309/// used in the calculation.
310/// \returns A 128-bit vector of [4 x float] containing the approximate
311/// reciprocal of the square root of the value in the low-order bits of the
312/// operand.
313static __inline__ __m128 __DEFAULT_FN_ATTRS
315{
316 return __builtin_ia32_rsqrtss((__v4sf)__a);
317}
318
319/// Calculates the approximate reciprocals of the square roots of the
320/// values stored in a 128-bit vector of [4 x float].
321///
322/// \headerfile <x86intrin.h>
323///
324/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
325///
326/// \param __a
327/// A 128-bit vector of [4 x float].
328/// \returns A 128-bit vector of [4 x float] containing the approximate
329/// reciprocals of the square roots of the values in the operand.
330static __inline__ __m128 __DEFAULT_FN_ATTRS
332{
333 return __builtin_ia32_rsqrtps((__v4sf)__a);
334}
335
336/// Compares two 32-bit float values in the low-order bits of both
337/// operands and returns the lesser value in the low-order bits of the
338/// vector of [4 x float].
339///
340/// If either value in a comparison is NaN, returns the value from \a __b.
341///
342/// \headerfile <x86intrin.h>
343///
344/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
345///
346/// \param __a
347/// A 128-bit vector of [4 x float] containing one of the operands. The lower
348/// 32 bits of this operand are used in the comparison.
349/// \param __b
350/// A 128-bit vector of [4 x float] containing one of the operands. The lower
351/// 32 bits of this operand are used in the comparison.
352/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
353/// minimum value between both operands. The upper 96 bits are copied from
354/// the upper 96 bits of the first source operand.
355static __inline__ __m128 __DEFAULT_FN_ATTRS
356_mm_min_ss(__m128 __a, __m128 __b)
357{
358 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
359}
360
361/// Compares two 128-bit vectors of [4 x float] and returns the lesser
362/// of each pair of values.
363///
364/// If either value in a comparison is NaN, returns the value from \a __b.
365///
366/// \headerfile <x86intrin.h>
367///
368/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
369///
370/// \param __a
371/// A 128-bit vector of [4 x float] containing one of the operands.
372/// \param __b
373/// A 128-bit vector of [4 x float] containing one of the operands.
374/// \returns A 128-bit vector of [4 x float] containing the minimum values
375/// between both operands.
376static __inline__ __m128 __DEFAULT_FN_ATTRS
377_mm_min_ps(__m128 __a, __m128 __b)
378{
379 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
380}
381
382/// Compares two 32-bit float values in the low-order bits of both
383/// operands and returns the greater value in the low-order bits of a 128-bit
384/// vector of [4 x float].
385///
386/// If either value in a comparison is NaN, returns the value from \a __b.
387///
388/// \headerfile <x86intrin.h>
389///
390/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
391///
392/// \param __a
393/// A 128-bit vector of [4 x float] containing one of the operands. The lower
394/// 32 bits of this operand are used in the comparison.
395/// \param __b
396/// A 128-bit vector of [4 x float] containing one of the operands. The lower
397/// 32 bits of this operand are used in the comparison.
398/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
399/// maximum value between both operands. The upper 96 bits are copied from
400/// the upper 96 bits of the first source operand.
401static __inline__ __m128 __DEFAULT_FN_ATTRS
402_mm_max_ss(__m128 __a, __m128 __b)
403{
404 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
405}
406
407/// Compares two 128-bit vectors of [4 x float] and returns the greater
408/// of each pair of values.
409///
410/// If either value in a comparison is NaN, returns the value from \a __b.
411///
412/// \headerfile <x86intrin.h>
413///
414/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
415///
416/// \param __a
417/// A 128-bit vector of [4 x float] containing one of the operands.
418/// \param __b
419/// A 128-bit vector of [4 x float] containing one of the operands.
420/// \returns A 128-bit vector of [4 x float] containing the maximum values
421/// between both operands.
422static __inline__ __m128 __DEFAULT_FN_ATTRS
423_mm_max_ps(__m128 __a, __m128 __b)
424{
425 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
426}
427
428/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
429///
430/// \headerfile <x86intrin.h>
431///
432/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
433///
434/// \param __a
435/// A 128-bit vector containing one of the source operands.
436/// \param __b
437/// A 128-bit vector containing one of the source operands.
438/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
439/// values between both operands.
440static __inline__ __m128 __DEFAULT_FN_ATTRS
441_mm_and_ps(__m128 __a, __m128 __b)
442{
443 return (__m128)((__v4su)__a & (__v4su)__b);
444}
445
446/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
447/// the one's complement of the values contained in the first source
448/// operand.
449///
450/// \headerfile <x86intrin.h>
451///
452/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
453///
454/// \param __a
455/// A 128-bit vector of [4 x float] containing the first source operand. The
456/// one's complement of this value is used in the bitwise AND.
457/// \param __b
458/// A 128-bit vector of [4 x float] containing the second source operand.
459/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
460/// one's complement of the first operand and the values in the second
461/// operand.
462static __inline__ __m128 __DEFAULT_FN_ATTRS
463_mm_andnot_ps(__m128 __a, __m128 __b)
464{
465 return (__m128)(~(__v4su)__a & (__v4su)__b);
466}
467
468/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
469///
470/// \headerfile <x86intrin.h>
471///
472/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
473///
474/// \param __a
475/// A 128-bit vector of [4 x float] containing one of the source operands.
476/// \param __b
477/// A 128-bit vector of [4 x float] containing one of the source operands.
478/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
479/// values between both operands.
480static __inline__ __m128 __DEFAULT_FN_ATTRS
481_mm_or_ps(__m128 __a, __m128 __b)
482{
483 return (__m128)((__v4su)__a | (__v4su)__b);
484}
485
486/// Performs a bitwise exclusive OR of two 128-bit vectors of
487/// [4 x float].
488///
489/// \headerfile <x86intrin.h>
490///
491/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
492///
493/// \param __a
494/// A 128-bit vector of [4 x float] containing one of the source operands.
495/// \param __b
496/// A 128-bit vector of [4 x float] containing one of the source operands.
497/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
498/// of the values between both operands.
499static __inline__ __m128 __DEFAULT_FN_ATTRS
500_mm_xor_ps(__m128 __a, __m128 __b)
501{
502 return (__m128)((__v4su)__a ^ (__v4su)__b);
503}
504
505/// Compares two 32-bit float values in the low-order bits of both
506/// operands for equality.
507///
508/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
509/// low-order bits of a vector of [4 x float].
510/// If either value in a comparison is NaN, returns false.
511///
512/// \headerfile <x86intrin.h>
513///
514/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
515///
516/// \param __a
517/// A 128-bit vector of [4 x float] containing one of the operands. The lower
518/// 32 bits of this operand are used in the comparison.
519/// \param __b
520/// A 128-bit vector of [4 x float] containing one of the operands. The lower
521/// 32 bits of this operand are used in the comparison.
522/// \returns A 128-bit vector of [4 x float] containing the comparison results
523/// in the low-order bits.
524static __inline__ __m128 __DEFAULT_FN_ATTRS
525_mm_cmpeq_ss(__m128 __a, __m128 __b)
526{
527 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
528}
529
530/// Compares each of the corresponding 32-bit float values of the
531/// 128-bit vectors of [4 x float] for equality.
532///
533/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
534/// If either value in a comparison is NaN, returns false.
535///
536/// \headerfile <x86intrin.h>
537///
538/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
539///
540/// \param __a
541/// A 128-bit vector of [4 x float].
542/// \param __b
543/// A 128-bit vector of [4 x float].
544/// \returns A 128-bit vector of [4 x float] containing the comparison results.
545static __inline__ __m128 __DEFAULT_FN_ATTRS
546_mm_cmpeq_ps(__m128 __a, __m128 __b)
547{
548 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
549}
550
551/// Compares two 32-bit float values in the low-order bits of both
552/// operands to determine if the value in the first operand is less than the
553/// corresponding value in the second operand.
554///
555/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
556/// low-order bits of a vector of [4 x float].
557/// If either value in a comparison is NaN, returns false.
558///
559/// \headerfile <x86intrin.h>
560///
561/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
562///
563/// \param __a
564/// A 128-bit vector of [4 x float] containing one of the operands. The lower
565/// 32 bits of this operand are used in the comparison.
566/// \param __b
567/// A 128-bit vector of [4 x float] containing one of the operands. The lower
568/// 32 bits of this operand are used in the comparison.
569/// \returns A 128-bit vector of [4 x float] containing the comparison results
570/// in the low-order bits.
571static __inline__ __m128 __DEFAULT_FN_ATTRS
572_mm_cmplt_ss(__m128 __a, __m128 __b)
573{
574 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
575}
576
577/// Compares each of the corresponding 32-bit float values of the
578/// 128-bit vectors of [4 x float] to determine if the values in the first
579/// operand are less than those in the second operand.
580///
581/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
582/// If either value in a comparison is NaN, returns false.
583///
584/// \headerfile <x86intrin.h>
585///
586/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
587///
588/// \param __a
589/// A 128-bit vector of [4 x float].
590/// \param __b
591/// A 128-bit vector of [4 x float].
592/// \returns A 128-bit vector of [4 x float] containing the comparison results.
593static __inline__ __m128 __DEFAULT_FN_ATTRS
594_mm_cmplt_ps(__m128 __a, __m128 __b)
595{
596 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
597}
598
599/// Compares two 32-bit float values in the low-order bits of both
600/// operands to determine if the value in the first operand is less than or
601/// equal to the corresponding value in the second operand.
602///
603/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in
604/// the low-order bits of a vector of [4 x float].
605/// If either value in a comparison is NaN, returns false.
606///
607/// \headerfile <x86intrin.h>
608///
609/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
610///
611/// \param __a
612/// A 128-bit vector of [4 x float] containing one of the operands. The lower
613/// 32 bits of this operand are used in the comparison.
614/// \param __b
615/// A 128-bit vector of [4 x float] containing one of the operands. The lower
616/// 32 bits of this operand are used in the comparison.
617/// \returns A 128-bit vector of [4 x float] containing the comparison results
618/// in the low-order bits.
619static __inline__ __m128 __DEFAULT_FN_ATTRS
620_mm_cmple_ss(__m128 __a, __m128 __b)
621{
622 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
623}
624
625/// Compares each of the corresponding 32-bit float values of the
626/// 128-bit vectors of [4 x float] to determine if the values in the first
627/// operand are less than or equal to those in the second operand.
628///
629/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
630/// If either value in a comparison is NaN, returns false.
631///
632/// \headerfile <x86intrin.h>
633///
634/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
635///
636/// \param __a
637/// A 128-bit vector of [4 x float].
638/// \param __b
639/// A 128-bit vector of [4 x float].
640/// \returns A 128-bit vector of [4 x float] containing the comparison results.
641static __inline__ __m128 __DEFAULT_FN_ATTRS
642_mm_cmple_ps(__m128 __a, __m128 __b)
643{
644 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
645}
646
647/// Compares two 32-bit float values in the low-order bits of both
648/// operands to determine if the value in the first operand is greater than
649/// the corresponding value in the second operand.
650///
651/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
652/// low-order bits of a vector of [4 x float].
653/// If either value in a comparison is NaN, returns false.
654///
655/// \headerfile <x86intrin.h>
656///
657/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
658///
659/// \param __a
660/// A 128-bit vector of [4 x float] containing one of the operands. The lower
661/// 32 bits of this operand are used in the comparison.
662/// \param __b
663/// A 128-bit vector of [4 x float] containing one of the operands. The lower
664/// 32 bits of this operand are used in the comparison.
665/// \returns A 128-bit vector of [4 x float] containing the comparison results
666/// in the low-order bits.
667static __inline__ __m128 __DEFAULT_FN_ATTRS
668_mm_cmpgt_ss(__m128 __a, __m128 __b)
669{
670 return (__m128)__builtin_shufflevector((__v4sf)__a,
671 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
672 4, 1, 2, 3);
673}
674
675/// Compares each of the corresponding 32-bit float values of the
676/// 128-bit vectors of [4 x float] to determine if the values in the first
677/// operand are greater than those in the second operand.
678///
679/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
680/// If either value in a comparison is NaN, returns false.
681///
682/// \headerfile <x86intrin.h>
683///
684/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
685///
686/// \param __a
687/// A 128-bit vector of [4 x float].
688/// \param __b
689/// A 128-bit vector of [4 x float].
690/// \returns A 128-bit vector of [4 x float] containing the comparison results.
691static __inline__ __m128 __DEFAULT_FN_ATTRS
692_mm_cmpgt_ps(__m128 __a, __m128 __b)
693{
694 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
695}
696
697/// Compares two 32-bit float values in the low-order bits of both
698/// operands to determine if the value in the first operand is greater than
699/// or equal to the corresponding value in the second operand.
700///
701/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
702/// low-order bits of a vector of [4 x float].
703/// If either value in a comparison is NaN, returns false.
704///
705/// \headerfile <x86intrin.h>
706///
707/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
708///
709/// \param __a
710/// A 128-bit vector of [4 x float] containing one of the operands. The lower
711/// 32 bits of this operand are used in the comparison.
712/// \param __b
713/// A 128-bit vector of [4 x float] containing one of the operands. The lower
714/// 32 bits of this operand are used in the comparison.
715/// \returns A 128-bit vector of [4 x float] containing the comparison results
716/// in the low-order bits.
717static __inline__ __m128 __DEFAULT_FN_ATTRS
718_mm_cmpge_ss(__m128 __a, __m128 __b)
719{
720 return (__m128)__builtin_shufflevector((__v4sf)__a,
721 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
722 4, 1, 2, 3);
723}
724
725/// Compares each of the corresponding 32-bit float values of the
726/// 128-bit vectors of [4 x float] to determine if the values in the first
727/// operand are greater than or equal to those in the second operand.
728///
729/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
730/// If either value in a comparison is NaN, returns false.
731///
732/// \headerfile <x86intrin.h>
733///
734/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
735///
736/// \param __a
737/// A 128-bit vector of [4 x float].
738/// \param __b
739/// A 128-bit vector of [4 x float].
740/// \returns A 128-bit vector of [4 x float] containing the comparison results.
741static __inline__ __m128 __DEFAULT_FN_ATTRS
742_mm_cmpge_ps(__m128 __a, __m128 __b)
743{
744 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
745}
746
747/// Compares two 32-bit float values in the low-order bits of both operands
748/// for inequality.
749///
750/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
751/// low-order bits of a vector of [4 x float].
752/// If either value in a comparison is NaN, returns true.
753///
754/// \headerfile <x86intrin.h>
755///
756/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
757/// instructions.
758///
759/// \param __a
760/// A 128-bit vector of [4 x float] containing one of the operands. The lower
761/// 32 bits of this operand are used in the comparison.
762/// \param __b
763/// A 128-bit vector of [4 x float] containing one of the operands. The lower
764/// 32 bits of this operand are used in the comparison.
765/// \returns A 128-bit vector of [4 x float] containing the comparison results
766/// in the low-order bits.
767static __inline__ __m128 __DEFAULT_FN_ATTRS
768_mm_cmpneq_ss(__m128 __a, __m128 __b)
769{
770 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
771}
772
773/// Compares each of the corresponding 32-bit float values of the
774/// 128-bit vectors of [4 x float] for inequality.
775///
776/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
777/// If either value in a comparison is NaN, returns true.
778///
779/// \headerfile <x86intrin.h>
780///
781/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
782/// instructions.
783///
784/// \param __a
785/// A 128-bit vector of [4 x float].
786/// \param __b
787/// A 128-bit vector of [4 x float].
788/// \returns A 128-bit vector of [4 x float] containing the comparison results.
789static __inline__ __m128 __DEFAULT_FN_ATTRS
790_mm_cmpneq_ps(__m128 __a, __m128 __b)
791{
792 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
793}
794
795/// Compares two 32-bit float values in the low-order bits of both
796/// operands to determine if the value in the first operand is not less than
797/// the corresponding value in the second operand.
798///
799/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
800/// low-order bits of a vector of [4 x float].
801/// If either value in a comparison is NaN, returns true.
802///
803/// \headerfile <x86intrin.h>
804///
805/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
806/// instructions.
807///
808/// \param __a
809/// A 128-bit vector of [4 x float] containing one of the operands. The lower
810/// 32 bits of this operand are used in the comparison.
811/// \param __b
812/// A 128-bit vector of [4 x float] containing one of the operands. The lower
813/// 32 bits of this operand are used in the comparison.
814/// \returns A 128-bit vector of [4 x float] containing the comparison results
815/// in the low-order bits.
816static __inline__ __m128 __DEFAULT_FN_ATTRS
817_mm_cmpnlt_ss(__m128 __a, __m128 __b)
818{
819 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
820}
821
822/// Compares each of the corresponding 32-bit float values of the
823/// 128-bit vectors of [4 x float] to determine if the values in the first
824/// operand are not less than those in the second operand.
825///
826/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
827/// If either value in a comparison is NaN, returns true.
828///
829/// \headerfile <x86intrin.h>
830///
831/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
832/// instructions.
833///
834/// \param __a
835/// A 128-bit vector of [4 x float].
836/// \param __b
837/// A 128-bit vector of [4 x float].
838/// \returns A 128-bit vector of [4 x float] containing the comparison results.
839static __inline__ __m128 __DEFAULT_FN_ATTRS
840_mm_cmpnlt_ps(__m128 __a, __m128 __b)
841{
842 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
843}
844
845/// Compares two 32-bit float values in the low-order bits of both
846/// operands to determine if the value in the first operand is not less than
847/// or equal to the corresponding value in the second operand.
848///
849/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
850/// low-order bits of a vector of [4 x float].
851/// If either value in a comparison is NaN, returns true.
852///
853/// \headerfile <x86intrin.h>
854///
855/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
856/// instructions.
857///
858/// \param __a
859/// A 128-bit vector of [4 x float] containing one of the operands. The lower
860/// 32 bits of this operand are used in the comparison.
861/// \param __b
862/// A 128-bit vector of [4 x float] containing one of the operands. The lower
863/// 32 bits of this operand are used in the comparison.
864/// \returns A 128-bit vector of [4 x float] containing the comparison results
865/// in the low-order bits.
866static __inline__ __m128 __DEFAULT_FN_ATTRS
867_mm_cmpnle_ss(__m128 __a, __m128 __b)
868{
869 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
870}
871
872/// Compares each of the corresponding 32-bit float values of the
873/// 128-bit vectors of [4 x float] to determine if the values in the first
874/// operand are not less than or equal to those in the second operand.
875///
876/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
877/// If either value in a comparison is NaN, returns true.
878///
879/// \headerfile <x86intrin.h>
880///
881/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
882/// instructions.
883///
884/// \param __a
885/// A 128-bit vector of [4 x float].
886/// \param __b
887/// A 128-bit vector of [4 x float].
888/// \returns A 128-bit vector of [4 x float] containing the comparison results.
889static __inline__ __m128 __DEFAULT_FN_ATTRS
890_mm_cmpnle_ps(__m128 __a, __m128 __b)
891{
892 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
893}
894
895/// Compares two 32-bit float values in the low-order bits of both
896/// operands to determine if the value in the first operand is not greater
897/// than the corresponding value in the second operand.
898///
899/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
900/// low-order bits of a vector of [4 x float].
901/// If either value in a comparison is NaN, returns true.
902///
903/// \headerfile <x86intrin.h>
904///
905/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
906/// instructions.
907///
908/// \param __a
909/// A 128-bit vector of [4 x float] containing one of the operands. The lower
910/// 32 bits of this operand are used in the comparison.
911/// \param __b
912/// A 128-bit vector of [4 x float] containing one of the operands. The lower
913/// 32 bits of this operand are used in the comparison.
914/// \returns A 128-bit vector of [4 x float] containing the comparison results
915/// in the low-order bits.
916static __inline__ __m128 __DEFAULT_FN_ATTRS
917_mm_cmpngt_ss(__m128 __a, __m128 __b)
918{
919 return (__m128)__builtin_shufflevector((__v4sf)__a,
920 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
921 4, 1, 2, 3);
922}
923
924/// Compares each of the corresponding 32-bit float values of the
925/// 128-bit vectors of [4 x float] to determine if the values in the first
926/// operand are not greater than those in the second operand.
927///
928/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
929/// If either value in a comparison is NaN, returns true.
930///
931/// \headerfile <x86intrin.h>
932///
933/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
934/// instructions.
935///
936/// \param __a
937/// A 128-bit vector of [4 x float].
938/// \param __b
939/// A 128-bit vector of [4 x float].
940/// \returns A 128-bit vector of [4 x float] containing the comparison results.
941static __inline__ __m128 __DEFAULT_FN_ATTRS
942_mm_cmpngt_ps(__m128 __a, __m128 __b)
943{
944 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
945}
946
947/// Compares two 32-bit float values in the low-order bits of both
948/// operands to determine if the value in the first operand is not greater
949/// than or equal to the corresponding value in the second operand.
950///
951/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
952/// low-order bits of a vector of [4 x float].
953/// If either value in a comparison is NaN, returns true.
954///
955/// \headerfile <x86intrin.h>
956///
957/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
958/// instructions.
959///
960/// \param __a
961/// A 128-bit vector of [4 x float] containing one of the operands. The lower
962/// 32 bits of this operand are used in the comparison.
963/// \param __b
964/// A 128-bit vector of [4 x float] containing one of the operands. The lower
965/// 32 bits of this operand are used in the comparison.
966/// \returns A 128-bit vector of [4 x float] containing the comparison results
967/// in the low-order bits.
968static __inline__ __m128 __DEFAULT_FN_ATTRS
969_mm_cmpnge_ss(__m128 __a, __m128 __b)
970{
971 return (__m128)__builtin_shufflevector((__v4sf)__a,
972 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
973 4, 1, 2, 3);
974}
975
976/// Compares each of the corresponding 32-bit float values of the
977/// 128-bit vectors of [4 x float] to determine if the values in the first
978/// operand are not greater than or equal to those in the second operand.
979///
980/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
981/// If either value in a comparison is NaN, returns true.
982///
983/// \headerfile <x86intrin.h>
984///
985/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
986/// instructions.
987///
988/// \param __a
989/// A 128-bit vector of [4 x float].
990/// \param __b
991/// A 128-bit vector of [4 x float].
992/// \returns A 128-bit vector of [4 x float] containing the comparison results.
993static __inline__ __m128 __DEFAULT_FN_ATTRS
994_mm_cmpnge_ps(__m128 __a, __m128 __b)
995{
996 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
997}
998
999/// Compares two 32-bit float values in the low-order bits of both
1000/// operands to determine if the value in the first operand is ordered with
1001/// respect to the corresponding value in the second operand.
1002///
1003/// A pair of floating-point values are ordered with respect to each
1004/// other if neither value is a NaN. Each comparison returns 0x0 for false,
1005/// 0xFFFFFFFF for true.
1006///
1007/// \headerfile <x86intrin.h>
1008///
1009/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
1010/// instructions.
1011///
1012/// \param __a
1013/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1014/// 32 bits of this operand are used in the comparison.
1015/// \param __b
1016/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1017/// 32 bits of this operand are used in the comparison.
1018/// \returns A 128-bit vector of [4 x float] containing the comparison results
1019/// in the low-order bits.
1020static __inline__ __m128 __DEFAULT_FN_ATTRS
1021_mm_cmpord_ss(__m128 __a, __m128 __b)
1022{
1023 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
1024}
1025
1026/// Compares each of the corresponding 32-bit float values of the
1027/// 128-bit vectors of [4 x float] to determine if the values in the first
1028/// operand are ordered with respect to those in the second operand.
1029///
1030/// A pair of floating-point values are ordered with respect to each
1031/// other if neither value is a NaN. Each comparison returns 0x0 for false,
1032/// 0xFFFFFFFF for true.
1033///
1034/// \headerfile <x86intrin.h>
1035///
1036/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
1037/// instructions.
1038///
1039/// \param __a
1040/// A 128-bit vector of [4 x float].
1041/// \param __b
1042/// A 128-bit vector of [4 x float].
1043/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1044static __inline__ __m128 __DEFAULT_FN_ATTRS
1045_mm_cmpord_ps(__m128 __a, __m128 __b)
1046{
1047 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
1048}
1049
1050/// Compares two 32-bit float values in the low-order bits of both
1051/// operands to determine if the value in the first operand is unordered
1052/// with respect to the corresponding value in the second operand.
1053///
1054/// A pair of double-precision values are unordered with respect to each
1055/// other if one or both values are NaN. Each comparison returns 0x0 for
1056/// false, 0xFFFFFFFF for true.
1057///
1058/// \headerfile <x86intrin.h>
1059///
1060/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
1061/// instructions.
1062///
1063/// \param __a
1064/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1065/// 32 bits of this operand are used in the comparison.
1066/// \param __b
1067/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1068/// 32 bits of this operand are used in the comparison.
1069/// \returns A 128-bit vector of [4 x float] containing the comparison results
1070/// in the low-order bits.
1071static __inline__ __m128 __DEFAULT_FN_ATTRS
1072_mm_cmpunord_ss(__m128 __a, __m128 __b)
1073{
1074 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
1075}
1076
1077/// Compares each of the corresponding 32-bit float values of the
1078/// 128-bit vectors of [4 x float] to determine if the values in the first
1079/// operand are unordered with respect to those in the second operand.
1080///
1081/// A pair of double-precision values are unordered with respect to each
1082/// other if one or both values are NaN. Each comparison returns 0x0 for
1083/// false, 0xFFFFFFFFFFFFFFFF for true.
1084///
1085/// \headerfile <x86intrin.h>
1086///
1087/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
1088/// instructions.
1089///
1090/// \param __a
1091/// A 128-bit vector of [4 x float].
1092/// \param __b
1093/// A 128-bit vector of [4 x float].
1094/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1095static __inline__ __m128 __DEFAULT_FN_ATTRS
1096_mm_cmpunord_ps(__m128 __a, __m128 __b)
1097{
1098 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1099}
1100
1101/// Compares two 32-bit float values in the low-order bits of both
1102/// operands for equality.
1103///
1104/// The comparison returns 0 for false, 1 for true. If either value in a
1105/// comparison is NaN, returns 0.
1106///
1107/// \headerfile <x86intrin.h>
1108///
1109/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1110/// instructions.
1111///
1112/// \param __a
1113/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1114/// used in the comparison.
1115/// \param __b
1116/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1117/// used in the comparison.
1118/// \returns An integer containing the comparison results.
1119static __inline__ int __DEFAULT_FN_ATTRS
1120_mm_comieq_ss(__m128 __a, __m128 __b)
1121{
1122 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1123}
1124
1125/// Compares two 32-bit float values in the low-order bits of both
1126/// operands to determine if the first operand is less than the second
1127/// operand.
1128///
1129/// The comparison returns 0 for false, 1 for true. If either value in a
1130/// comparison is NaN, returns 0.
1131///
1132/// \headerfile <x86intrin.h>
1133///
1134/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1135/// instructions.
1136///
1137/// \param __a
1138/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1139/// used in the comparison.
1140/// \param __b
1141/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1142/// used in the comparison.
1143/// \returns An integer containing the comparison results.
1144static __inline__ int __DEFAULT_FN_ATTRS
1145_mm_comilt_ss(__m128 __a, __m128 __b)
1146{
1147 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1148}
1149
1150/// Compares two 32-bit float values in the low-order bits of both
1151/// operands to determine if the first operand is less than or equal to the
1152/// second operand.
1153///
1154/// The comparison returns 0 for false, 1 for true. If either value in a
1155/// comparison is NaN, returns 0.
1156///
1157/// \headerfile <x86intrin.h>
1158///
1159/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1160///
1161/// \param __a
1162/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1163/// used in the comparison.
1164/// \param __b
1165/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1166/// used in the comparison.
1167/// \returns An integer containing the comparison results.
1168static __inline__ int __DEFAULT_FN_ATTRS
1169_mm_comile_ss(__m128 __a, __m128 __b)
1170{
1171 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1172}
1173
1174/// Compares two 32-bit float values in the low-order bits of both
1175/// operands to determine if the first operand is greater than the second
1176/// operand.
1177///
1178/// The comparison returns 0 for false, 1 for true. If either value in a
1179/// comparison is NaN, returns 0.
1180///
1181/// \headerfile <x86intrin.h>
1182///
1183/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1184///
1185/// \param __a
1186/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1187/// used in the comparison.
1188/// \param __b
1189/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1190/// used in the comparison.
1191/// \returns An integer containing the comparison results.
1192static __inline__ int __DEFAULT_FN_ATTRS
1193_mm_comigt_ss(__m128 __a, __m128 __b)
1194{
1195 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1196}
1197
1198/// Compares two 32-bit float values in the low-order bits of both
1199/// operands to determine if the first operand is greater than or equal to
1200/// the second operand.
1201///
1202/// The comparison returns 0 for false, 1 for true. If either value in a
1203/// comparison is NaN, returns 0.
1204///
1205/// \headerfile <x86intrin.h>
1206///
1207/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1208///
1209/// \param __a
1210/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1211/// used in the comparison.
1212/// \param __b
1213/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1214/// used in the comparison.
1215/// \returns An integer containing the comparison results.
1216static __inline__ int __DEFAULT_FN_ATTRS
1217_mm_comige_ss(__m128 __a, __m128 __b)
1218{
1219 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1220}
1221
1222/// Compares two 32-bit float values in the low-order bits of both
1223/// operands to determine if the first operand is not equal to the second
1224/// operand.
1225///
1226/// The comparison returns 0 for false, 1 for true. If either value in a
1227/// comparison is NaN, returns 1.
1228///
1229/// \headerfile <x86intrin.h>
1230///
1231/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1232///
1233/// \param __a
1234/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1235/// used in the comparison.
1236/// \param __b
1237/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1238/// used in the comparison.
1239/// \returns An integer containing the comparison results.
1240static __inline__ int __DEFAULT_FN_ATTRS
1241_mm_comineq_ss(__m128 __a, __m128 __b)
1242{
1243 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1244}
1245
1246/// Performs an unordered comparison of two 32-bit float values using
1247/// the low-order bits of both operands to determine equality.
1248///
1249/// The comparison returns 0 for false, 1 for true. If either value in a
1250/// comparison is NaN, returns 0.
1251///
1252/// \headerfile <x86intrin.h>
1253///
1254/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1255///
1256/// \param __a
1257/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1258/// used in the comparison.
1259/// \param __b
1260/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1261/// used in the comparison.
1262/// \returns An integer containing the comparison results.
1263static __inline__ int __DEFAULT_FN_ATTRS
1264_mm_ucomieq_ss(__m128 __a, __m128 __b)
1265{
1266 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1267}
1268
1269/// Performs an unordered comparison of two 32-bit float values using
1270/// the low-order bits of both operands to determine if the first operand is
1271/// less than the second operand.
1272///
1273/// The comparison returns 0 for false, 1 for true. If either value in a
1274/// comparison is NaN, returns 0.
1275///
1276/// \headerfile <x86intrin.h>
1277///
1278/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1279///
1280/// \param __a
1281/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1282/// used in the comparison.
1283/// \param __b
1284/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1285/// used in the comparison.
1286/// \returns An integer containing the comparison results.
1287static __inline__ int __DEFAULT_FN_ATTRS
1288_mm_ucomilt_ss(__m128 __a, __m128 __b)
1289{
1290 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1291}
1292
1293/// Performs an unordered comparison of two 32-bit float values using
1294/// the low-order bits of both operands to determine if the first operand is
1295/// less than or equal to the second operand.
1296///
1297/// The comparison returns 0 for false, 1 for true. If either value in a
1298/// comparison is NaN, returns 0.
1299///
1300/// \headerfile <x86intrin.h>
1301///
1302/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1303///
1304/// \param __a
1305/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1306/// used in the comparison.
1307/// \param __b
1308/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1309/// used in the comparison.
1310/// \returns An integer containing the comparison results.
1311static __inline__ int __DEFAULT_FN_ATTRS
1312_mm_ucomile_ss(__m128 __a, __m128 __b)
1313{
1314 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1315}
1316
1317/// Performs an unordered comparison of two 32-bit float values using
1318/// the low-order bits of both operands to determine if the first operand is
1319/// greater than the second operand.
1320///
1321/// The comparison returns 0 for false, 1 for true. If either value in a
1322/// comparison is NaN, returns 0.
1323///
1324/// \headerfile <x86intrin.h>
1325///
1326/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1327///
1328/// \param __a
1329/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1330/// used in the comparison.
1331/// \param __b
1332/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1333/// used in the comparison.
1334/// \returns An integer containing the comparison results.
1335static __inline__ int __DEFAULT_FN_ATTRS
1336_mm_ucomigt_ss(__m128 __a, __m128 __b)
1337{
1338 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1339}
1340
1341/// Performs an unordered comparison of two 32-bit float values using
1342/// the low-order bits of both operands to determine if the first operand is
1343/// greater than or equal to the second operand.
1344///
1345/// The comparison returns 0 for false, 1 for true. If either value in a
1346/// comparison is NaN, returns 0.
1347///
1348/// \headerfile <x86intrin.h>
1349///
1350/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1351///
1352/// \param __a
1353/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1354/// used in the comparison.
1355/// \param __b
1356/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1357/// used in the comparison.
1358/// \returns An integer containing the comparison results.
1359static __inline__ int __DEFAULT_FN_ATTRS
1360_mm_ucomige_ss(__m128 __a, __m128 __b)
1361{
1362 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1363}
1364
1365/// Performs an unordered comparison of two 32-bit float values using
1366/// the low-order bits of both operands to determine inequality.
1367///
1368/// The comparison returns 0 for false, 1 for true. If either value in a
1369/// comparison is NaN, returns 0.
1370///
1371/// \headerfile <x86intrin.h>
1372///
1373/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1374///
1375/// \param __a
1376/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1377/// used in the comparison.
1378/// \param __b
1379/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1380/// used in the comparison.
1381/// \returns An integer containing the comparison results.
1382static __inline__ int __DEFAULT_FN_ATTRS
1383_mm_ucomineq_ss(__m128 __a, __m128 __b)
1384{
1385 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1386}
1387
1388/// Converts a float value contained in the lower 32 bits of a vector of
1389/// [4 x float] into a 32-bit integer.
1390///
1391/// If the converted value does not fit in a 32-bit integer, raises a
1392/// floating-point invalid exception. If the exception is masked, returns
1393/// the most negative integer.
1394///
1395/// \headerfile <x86intrin.h>
1396///
1397/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1398/// instructions.
1399///
1400/// \param __a
1401/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1402/// used in the conversion.
1403/// \returns A 32-bit integer containing the converted value.
1404static __inline__ int __DEFAULT_FN_ATTRS
1406{
1407 return __builtin_ia32_cvtss2si((__v4sf)__a);
1408}
1409
1410/// Converts a float value contained in the lower 32 bits of a vector of
1411/// [4 x float] into a 32-bit integer.
1412///
1413/// If the converted value does not fit in a 32-bit integer, raises a
1414/// floating-point invalid exception. If the exception is masked, returns
1415/// the most negative integer.
1416///
1417/// \headerfile <x86intrin.h>
1418///
1419/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1420/// instructions.
1421///
1422/// \param __a
1423/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1424/// used in the conversion.
1425/// \returns A 32-bit integer containing the converted value.
1426static __inline__ int __DEFAULT_FN_ATTRS
1428{
1429 return _mm_cvtss_si32(__a);
1430}
1431
1432#ifdef __x86_64__
1433
1434/// Converts a float value contained in the lower 32 bits of a vector of
1435/// [4 x float] into a 64-bit integer.
1436///
1437/// If the converted value does not fit in a 32-bit integer, raises a
1438/// floating-point invalid exception. If the exception is masked, returns
1439/// the most negative integer.
1440///
1441/// \headerfile <x86intrin.h>
1442///
1443/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1444/// instructions.
1445///
1446/// \param __a
1447/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1448/// used in the conversion.
1449/// \returns A 64-bit integer containing the converted value.
1450static __inline__ long long __DEFAULT_FN_ATTRS
1451_mm_cvtss_si64(__m128 __a)
1452{
1453 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1454}
1455
1456#endif
1457
1458/// Converts two low-order float values in a 128-bit vector of
1459/// [4 x float] into a 64-bit vector of [2 x i32].
1460///
1461/// If a converted value does not fit in a 32-bit integer, raises a
1462/// floating-point invalid exception. If the exception is masked, returns
1463/// the most negative integer.
1464///
1465/// \headerfile <x86intrin.h>
1466///
1467/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1468///
1469/// \param __a
1470/// A 128-bit vector of [4 x float].
1471/// \returns A 64-bit integer vector containing the converted values.
1472static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1474{
1475 return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
1476}
1477
1478/// Converts two low-order float values in a 128-bit vector of
1479/// [4 x float] into a 64-bit vector of [2 x i32].
1480///
1481/// If a converted value does not fit in a 32-bit integer, raises a
1482/// floating-point invalid exception. If the exception is masked, returns
1483/// the most negative integer.
1484///
1485/// \headerfile <x86intrin.h>
1486///
1487/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1488///
1489/// \param __a
1490/// A 128-bit vector of [4 x float].
1491/// \returns A 64-bit integer vector containing the converted values.
1492static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1494{
1495 return _mm_cvtps_pi32(__a);
1496}
1497
1498/// Converts the lower (first) element of a vector of [4 x float] into a signed
1499/// truncated (rounded toward zero) 32-bit integer.
1500///
1501/// If the converted value does not fit in a 32-bit integer, raises a
1502/// floating-point invalid exception. If the exception is masked, returns
1503/// the most negative integer.
1504///
1505/// \headerfile <x86intrin.h>
1506///
1507/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1508/// instructions.
1509///
1510/// \param __a
1511/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1512/// used in the conversion.
1513/// \returns A 32-bit integer containing the converted value.
1514static __inline__ int __DEFAULT_FN_ATTRS
1516{
1517 return __builtin_ia32_cvttss2si((__v4sf)__a);
1518}
1519
1520/// Converts the lower (first) element of a vector of [4 x float] into a signed
1521/// truncated (rounded toward zero) 32-bit integer.
1522///
1523/// If the converted value does not fit in a 32-bit integer, raises a
1524/// floating-point invalid exception. If the exception is masked, returns
1525/// the most negative integer.
1526///
1527/// \headerfile <x86intrin.h>
1528///
1529/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1530/// instructions.
1531///
1532/// \param __a
1533/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1534/// used in the conversion.
1535/// \returns A 32-bit integer containing the converted value.
1536static __inline__ int __DEFAULT_FN_ATTRS
1538{
1539 return _mm_cvttss_si32(__a);
1540}
1541
1542#ifdef __x86_64__
1543/// Converts the lower (first) element of a vector of [4 x float] into a signed
1544/// truncated (rounded toward zero) 64-bit integer.
1545///
1546/// If the converted value does not fit in a 64-bit integer, raises a
1547/// floating-point invalid exception. If the exception is masked, returns
1548/// the most negative integer.
1549///
1550/// \headerfile <x86intrin.h>
1551///
1552/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1553/// instructions.
1554///
1555/// \param __a
1556/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1557/// used in the conversion.
1558/// \returns A 64-bit integer containing the converted value.
1559static __inline__ long long __DEFAULT_FN_ATTRS
1560_mm_cvttss_si64(__m128 __a)
1561{
1562 return __builtin_ia32_cvttss2si64((__v4sf)__a);
1563}
1564#endif
1565
1566/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1567/// into two signed truncated (rounded toward zero) 32-bit integers,
1568/// returned in a 64-bit vector of [2 x i32].
1569///
1570/// If a converted value does not fit in a 32-bit integer, raises a
1571/// floating-point invalid exception. If the exception is masked, returns
1572/// the most negative integer.
1573///
1574/// \headerfile <x86intrin.h>
1575///
1576/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1577/// instructions.
1578///
1579/// \param __a
1580/// A 128-bit vector of [4 x float].
1581/// \returns A 64-bit integer vector containing the converted values.
1582static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1584{
1585 return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
1586}
1587
1588/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1589/// into two signed truncated (rounded toward zero) 64-bit integers,
1590/// returned in a 64-bit vector of [2 x i32].
1591///
1592/// If a converted value does not fit in a 32-bit integer, raises a
1593/// floating-point invalid exception. If the exception is masked, returns
1594/// the most negative integer.
1595///
1596/// \headerfile <x86intrin.h>
1597///
1598/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1599///
1600/// \param __a
1601/// A 128-bit vector of [4 x float].
1602/// \returns A 64-bit integer vector containing the converted values.
1603static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1605{
1606 return _mm_cvttps_pi32(__a);
1607}
1608
1609/// Converts a 32-bit signed integer value into a floating point value
1610/// and writes it to the lower 32 bits of the destination. The remaining
1611/// higher order elements of the destination vector are copied from the
1612/// corresponding elements in the first operand.
1613///
1614/// \headerfile <x86intrin.h>
1615///
1616/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1617///
1618/// \param __a
1619/// A 128-bit vector of [4 x float].
1620/// \param __b
1621/// A 32-bit signed integer operand containing the value to be converted.
1622/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1623/// converted value of the second operand. The upper 96 bits are copied from
1624/// the upper 96 bits of the first operand.
1625static __inline__ __m128 __DEFAULT_FN_ATTRS
1627{
1628 __a[0] = __b;
1629 return __a;
1630}
1631
1632/// Converts a 32-bit signed integer value into a floating point value
1633/// and writes it to the lower 32 bits of the destination. The remaining
1634/// higher order elements of the destination are copied from the
1635/// corresponding elements in the first operand.
1636///
1637/// \headerfile <x86intrin.h>
1638///
1639/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1640///
1641/// \param __a
1642/// A 128-bit vector of [4 x float].
1643/// \param __b
1644/// A 32-bit signed integer operand containing the value to be converted.
1645/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1646/// converted value of the second operand. The upper 96 bits are copied from
1647/// the upper 96 bits of the first operand.
1648static __inline__ __m128 __DEFAULT_FN_ATTRS
1650{
1651 return _mm_cvtsi32_ss(__a, __b);
1652}
1653
1654#ifdef __x86_64__
1655
1656/// Converts a 64-bit signed integer value into a floating point value
1657/// and writes it to the lower 32 bits of the destination. The remaining
1658/// higher order elements of the destination are copied from the
1659/// corresponding elements in the first operand.
1660///
1661/// \headerfile <x86intrin.h>
1662///
1663/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1664///
1665/// \param __a
1666/// A 128-bit vector of [4 x float].
1667/// \param __b
1668/// A 64-bit signed integer operand containing the value to be converted.
1669/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1670/// converted value of the second operand. The upper 96 bits are copied from
1671/// the upper 96 bits of the first operand.
1672static __inline__ __m128 __DEFAULT_FN_ATTRS
1673_mm_cvtsi64_ss(__m128 __a, long long __b)
1674{
1675 __a[0] = __b;
1676 return __a;
1677}
1678
1679#endif
1680
1681/// Converts two elements of a 64-bit vector of [2 x i32] into two
1682/// floating point values and writes them to the lower 64-bits of the
1683/// destination. The remaining higher order elements of the destination are
1684/// copied from the corresponding elements in the first operand.
1685///
1686/// \headerfile <x86intrin.h>
1687///
1688/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1689///
1690/// \param __a
1691/// A 128-bit vector of [4 x float].
1692/// \param __b
1693/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1694/// and written to the corresponding low-order elements in the destination.
1695/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1696/// converted value of the second operand. The upper 64 bits are copied from
1697/// the upper 64 bits of the first operand.
1698static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
1699_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1700{
1701 return (__m128)__builtin_shufflevector(
1702 (__v4sf)__a,
1703 __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
1704 4, 5, 2, 3);
1705}
1706
1707/// Converts two elements of a 64-bit vector of [2 x i32] into two
1708/// floating point values and writes them to the lower 64-bits of the
1709/// destination. The remaining higher order elements of the destination are
1710/// copied from the corresponding elements in the first operand.
1711///
1712/// \headerfile <x86intrin.h>
1713///
1714/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1715///
1716/// \param __a
1717/// A 128-bit vector of [4 x float].
1718/// \param __b
1719/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1720/// and written to the corresponding low-order elements in the destination.
1721/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1722/// converted value from the second operand. The upper 64 bits are copied
1723/// from the upper 64 bits of the first operand.
1724static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
1725_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1726{
1727 return _mm_cvtpi32_ps(__a, __b);
1728}
1729
1730/// Extracts a float value contained in the lower 32 bits of a vector of
1731/// [4 x float].
1732///
1733/// \headerfile <x86intrin.h>
1734///
1735/// This intrinsic has no corresponding instruction.
1736///
1737/// \param __a
1738/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1739/// used in the extraction.
1740/// \returns A 32-bit float containing the extracted value.
1741static __inline__ float __DEFAULT_FN_ATTRS
1743{
1744 return __a[0];
1745}
1746
1747/// Loads two packed float values from the address \a __p into the
1748/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1749/// are copied from the low-order bits of the first operand.
1750///
1751/// \headerfile <x86intrin.h>
1752///
1753/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
1754///
1755/// \param __a
1756/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1757/// of the destination.
1758/// \param __p
1759/// A pointer to two packed float values. Bits [63:0] are written to bits
1760/// [127:64] of the destination.
1761/// \returns A 128-bit vector of [4 x float] containing the moved values.
1762static __inline__ __m128 __DEFAULT_FN_ATTRS
1763_mm_loadh_pi(__m128 __a, const __m64 *__p)
1764{
1765 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1766 struct __mm_loadh_pi_struct {
1767 __mm_loadh_pi_v2f32 __u;
1768 } __attribute__((__packed__, __may_alias__));
1769 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1770 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1771 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1772}
1773
1774/// Loads two packed float values from the address \a __p into the
1775/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1776/// are copied from the high-order bits of the first operand.
1777///
1778/// \headerfile <x86intrin.h>
1779///
1780/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1781///
1782/// \param __a
1783/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1784/// [127:64] of the destination.
1785/// \param __p
1786/// A pointer to two packed float values. Bits [63:0] are written to bits
1787/// [63:0] of the destination.
1788/// \returns A 128-bit vector of [4 x float] containing the moved values.
1789static __inline__ __m128 __DEFAULT_FN_ATTRS
1790_mm_loadl_pi(__m128 __a, const __m64 *__p)
1791{
1792 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1793 struct __mm_loadl_pi_struct {
1794 __mm_loadl_pi_v2f32 __u;
1795 } __attribute__((__packed__, __may_alias__));
1796 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1797 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1798 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1799}
1800
1801/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1802/// 32 bits of the vector are initialized with the single-precision
1803/// floating-point value loaded from a specified memory location. The upper
1804/// 96 bits are set to zero.
1805///
1806/// \headerfile <x86intrin.h>
1807///
1808/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1809///
1810/// \param __p
1811/// A pointer to a 32-bit memory location containing a single-precision
1812/// floating-point value.
1813/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1814/// lower 32 bits contain the value loaded from the memory location. The
1815/// upper 96 bits are set to zero.
1816static __inline__ __m128 __DEFAULT_FN_ATTRS
1817_mm_load_ss(const float *__p)
1818{
1819 struct __mm_load_ss_struct {
1820 float __u;
1821 } __attribute__((__packed__, __may_alias__));
1822 float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1823 return __extension__ (__m128){ __u, 0, 0, 0 };
1824}
1825
1826/// Loads a 32-bit float value and duplicates it to all four vector
1827/// elements of a 128-bit vector of [4 x float].
1828///
1829/// \headerfile <x86intrin.h>
1830///
1831/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1832/// instruction.
1833///
1834/// \param __p
1835/// A pointer to a float value to be loaded and duplicated.
1836/// \returns A 128-bit vector of [4 x float] containing the loaded and
1837/// duplicated values.
1838static __inline__ __m128 __DEFAULT_FN_ATTRS
1839_mm_load1_ps(const float *__p)
1840{
1841 struct __mm_load1_ps_struct {
1842 float __u;
1843 } __attribute__((__packed__, __may_alias__));
1844 float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1845 return __extension__ (__m128){ __u, __u, __u, __u };
1846}
1847
/* Alternate spelling of _mm_load1_ps. */
#define _mm_load_ps1(p) (_mm_load1_ps(p))
1849
1850/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1851/// memory location.
1852///
1853/// \headerfile <x86intrin.h>
1854///
1855/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1856///
1857/// \param __p
1858/// A pointer to a 128-bit memory location. The address of the memory
1859/// location has to be 128-bit aligned.
1860/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1861static __inline__ __m128 __DEFAULT_FN_ATTRS
1862_mm_load_ps(const float *__p)
1863{
1864 return *(const __m128*)__p;
1865}
1866
1867/// Loads a 128-bit floating-point vector of [4 x float] from an
1868/// unaligned memory location.
1869///
1870/// \headerfile <x86intrin.h>
1871///
1872/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1873///
1874/// \param __p
1875/// A pointer to a 128-bit memory location. The address of the memory
1876/// location does not have to be aligned.
1877/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1878static __inline__ __m128 __DEFAULT_FN_ATTRS
1879_mm_loadu_ps(const float *__p)
1880{
1881 struct __loadu_ps {
1882 __m128_u __v;
1883 } __attribute__((__packed__, __may_alias__));
1884 return ((const struct __loadu_ps*)__p)->__v;
1885}
1886
1887/// Loads four packed float values, in reverse order, from an aligned
1888/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1889///
1890/// \headerfile <x86intrin.h>
1891///
1892/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1893/// instruction.
1894///
1895/// \param __p
1896/// A pointer to a 128-bit memory location. The address of the memory
1897/// location has to be 128-bit aligned.
1898/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1899/// in reverse order.
1900static __inline__ __m128 __DEFAULT_FN_ATTRS
1901_mm_loadr_ps(const float *__p)
1902{
1903 __m128 __a = _mm_load_ps(__p);
1904 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1905}
1906
1907/// Create a 128-bit vector of [4 x float] with undefined values.
1908///
1909/// \headerfile <x86intrin.h>
1910///
1911/// This intrinsic has no corresponding instruction.
1912///
1913/// \returns A 128-bit vector of [4 x float] containing undefined values.
1914static __inline__ __m128 __DEFAULT_FN_ATTRS
1916{
1917 return (__m128)__builtin_ia32_undef128();
1918}
1919
1920/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1921/// 32 bits of the vector are initialized with the specified single-precision
1922/// floating-point value. The upper 96 bits are set to zero.
1923///
1924/// \headerfile <x86intrin.h>
1925///
1926/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1927///
1928/// \param __w
1929/// A single-precision floating-point value used to initialize the lower 32
1930/// bits of the result.
1931/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1932/// lower 32 bits contain the value provided in the source operand. The
1933/// upper 96 bits are set to zero.
1934static __inline__ __m128 __DEFAULT_FN_ATTRS
1935_mm_set_ss(float __w)
1936{
1937 return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
1938}
1939
1940/// Constructs a 128-bit floating-point vector of [4 x float], with each
1941/// of the four single-precision floating-point vector elements set to the
1942/// specified single-precision floating-point value.
1943///
1944/// \headerfile <x86intrin.h>
1945///
1946/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1947///
1948/// \param __w
1949/// A single-precision floating-point value used to initialize each vector
1950/// element of the result.
1951/// \returns An initialized 128-bit floating-point vector of [4 x float].
1952static __inline__ __m128 __DEFAULT_FN_ATTRS
1953_mm_set1_ps(float __w)
1954{
1955 return __extension__ (__m128){ __w, __w, __w, __w };
1956}
1957
1958/* Microsoft specific. */
1959/// Constructs a 128-bit floating-point vector of [4 x float], with each
1960/// of the four single-precision floating-point vector elements set to the
1961/// specified single-precision floating-point value.
1962///
1963/// \headerfile <x86intrin.h>
1964///
1965/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1966///
1967/// \param __w
1968/// A single-precision floating-point value used to initialize each vector
1969/// element of the result.
1970/// \returns An initialized 128-bit floating-point vector of [4 x float].
1971static __inline__ __m128 __DEFAULT_FN_ATTRS
1972_mm_set_ps1(float __w)
1973{
1974 return _mm_set1_ps(__w);
1975}
1976
1977/// Constructs a 128-bit floating-point vector of [4 x float]
1978/// initialized with the specified single-precision floating-point values.
1979///
1980/// \headerfile <x86intrin.h>
1981///
1982/// This intrinsic is a utility function and does not correspond to a specific
1983/// instruction.
1984///
1985/// \param __z
1986/// A single-precision floating-point value used to initialize bits [127:96]
1987/// of the result.
1988/// \param __y
1989/// A single-precision floating-point value used to initialize bits [95:64]
1990/// of the result.
1991/// \param __x
1992/// A single-precision floating-point value used to initialize bits [63:32]
1993/// of the result.
1994/// \param __w
1995/// A single-precision floating-point value used to initialize bits [31:0]
1996/// of the result.
1997/// \returns An initialized 128-bit floating-point vector of [4 x float].
1998static __inline__ __m128 __DEFAULT_FN_ATTRS
1999_mm_set_ps(float __z, float __y, float __x, float __w)
2000{
2001 return __extension__ (__m128){ __w, __x, __y, __z };
2002}
2003
2004/// Constructs a 128-bit floating-point vector of [4 x float],
2005/// initialized in reverse order with the specified 32-bit single-precision
2006/// floating-point values.
2007///
2008/// \headerfile <x86intrin.h>
2009///
2010/// This intrinsic is a utility function and does not correspond to a specific
2011/// instruction.
2012///
2013/// \param __z
2014/// A single-precision floating-point value used to initialize bits [31:0]
2015/// of the result.
2016/// \param __y
2017/// A single-precision floating-point value used to initialize bits [63:32]
2018/// of the result.
2019/// \param __x
2020/// A single-precision floating-point value used to initialize bits [95:64]
2021/// of the result.
2022/// \param __w
2023/// A single-precision floating-point value used to initialize bits [127:96]
2024/// of the result.
2025/// \returns An initialized 128-bit floating-point vector of [4 x float].
2026static __inline__ __m128 __DEFAULT_FN_ATTRS
2027_mm_setr_ps(float __z, float __y, float __x, float __w)
2028{
2029 return __extension__ (__m128){ __z, __y, __x, __w };
2030}
2031
2032/// Constructs a 128-bit floating-point vector of [4 x float] initialized
2033/// to zero.
2034///
2035/// \headerfile <x86intrin.h>
2036///
2037/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
2038///
2039/// \returns An initialized 128-bit floating-point vector of [4 x float] with
2040/// all elements set to zero.
2041static __inline__ __m128 __DEFAULT_FN_ATTRS
2043{
2044 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
2045}
2046
2047/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
2048/// memory location.
2049///
2050/// \headerfile <x86intrin.h>
2051///
2052/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
2053///
2054/// \param __p
2055/// A pointer to a 64-bit memory location.
2056/// \param __a
2057/// A 128-bit vector of [4 x float] containing the values to be stored.
2058static __inline__ void __DEFAULT_FN_ATTRS
2059_mm_storeh_pi(__m64 *__p, __m128 __a)
2060{
2061 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2062 struct __mm_storeh_pi_struct {
2063 __mm_storeh_pi_v2f32 __u;
2064 } __attribute__((__packed__, __may_alias__));
2065 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
2066}
2067
2068/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2069/// memory location.
2070///
2071/// \headerfile <x86intrin.h>
2072///
2073/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2074///
2075/// \param __p
2076/// A pointer to a memory location that will receive the float values.
2077/// \param __a
2078/// A 128-bit vector of [4 x float] containing the values to be stored.
2079static __inline__ void __DEFAULT_FN_ATTRS
2080_mm_storel_pi(__m64 *__p, __m128 __a)
2081{
2082 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2083 struct __mm_storeh_pi_struct {
2084 __mm_storeh_pi_v2f32 __u;
2085 } __attribute__((__packed__, __may_alias__));
2086 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
2087}
2088
2089/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
2090/// memory location.
2091///
2092/// \headerfile <x86intrin.h>
2093///
2094/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
2095///
2096/// \param __p
2097/// A pointer to a 32-bit memory location.
2098/// \param __a
2099/// A 128-bit vector of [4 x float] containing the value to be stored.
2100static __inline__ void __DEFAULT_FN_ATTRS
2101_mm_store_ss(float *__p, __m128 __a)
2102{
2103 struct __mm_store_ss_struct {
2104 float __u;
2105 } __attribute__((__packed__, __may_alias__));
2106 ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
2107}
2108
2109/// Stores a 128-bit vector of [4 x float] to an unaligned memory
2110/// location.
2111///
2112/// \headerfile <x86intrin.h>
2113///
2114/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
2115///
2116/// \param __p
2117/// A pointer to a 128-bit memory location. The address of the memory
2118/// location does not have to be aligned.
2119/// \param __a
2120/// A 128-bit vector of [4 x float] containing the values to be stored.
2121static __inline__ void __DEFAULT_FN_ATTRS
2122_mm_storeu_ps(float *__p, __m128 __a)
2123{
2124 struct __storeu_ps {
2125 __m128_u __v;
2126 } __attribute__((__packed__, __may_alias__));
2127 ((struct __storeu_ps*)__p)->__v = __a;
2128}
2129
2130/// Stores a 128-bit vector of [4 x float] into an aligned memory
2131/// location.
2132///
2133/// \headerfile <x86intrin.h>
2134///
2135/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2136///
2137/// \param __p
2138/// A pointer to a 128-bit memory location. The address of the memory
2139/// location has to be 16-byte aligned.
2140/// \param __a
2141/// A 128-bit vector of [4 x float] containing the values to be stored.
2142static __inline__ void __DEFAULT_FN_ATTRS
2143_mm_store_ps(float *__p, __m128 __a)
2144{
2145 *(__m128*)__p = __a;
2146}
2147
2148/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2149/// four contiguous elements in an aligned memory location.
2150///
2151/// \headerfile <x86intrin.h>
2152///
2153/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2154/// instruction.
2155///
2156/// \param __p
2157/// A pointer to a 128-bit memory location.
2158/// \param __a
2159/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2160/// of the four contiguous elements pointed by \a __p.
2161static __inline__ void __DEFAULT_FN_ATTRS
2162_mm_store1_ps(float *__p, __m128 __a)
2163{
2164 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2166}
2167
2168/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2169/// four contiguous elements in an aligned memory location.
2170///
2171/// \headerfile <x86intrin.h>
2172///
2173/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2174/// instruction.
2175///
2176/// \param __p
2177/// A pointer to a 128-bit memory location.
2178/// \param __a
2179/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2180/// of the four contiguous elements pointed by \a __p.
2181static __inline__ void __DEFAULT_FN_ATTRS
2182_mm_store_ps1(float *__p, __m128 __a)
2183{
2185}
2186
2187/// Stores float values from a 128-bit vector of [4 x float] to an
2188/// aligned memory location in reverse order.
2189///
2190/// \headerfile <x86intrin.h>
2191///
2192/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2193/// instruction.
2194///
2195/// \param __p
2196/// A pointer to a 128-bit memory location. The address of the memory
2197/// location has to be 128-bit aligned.
2198/// \param __a
2199/// A 128-bit vector of [4 x float] containing the values to be stored.
2200static __inline__ void __DEFAULT_FN_ATTRS
2201_mm_storer_ps(float *__p, __m128 __a)
2202{
2203 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2205}
2206
/* Prefetch hint selectors, listed in ascending numeric order. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2213
2214#ifndef _MSC_VER
2215/* FIXME: We have to #define this because "sel" must be a constant integer, and
2216 Sema doesn't do any form of constant propagation yet. */
2217
2218/// Loads one cache line of data from the specified address to a location
2219/// closer to the processor.
2220///
2221/// \headerfile <x86intrin.h>
2222///
2223/// \code
2224/// void _mm_prefetch(const void *a, const int sel);
2225/// \endcode
2226///
2227/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
2228///
2229/// \param a
2230/// A pointer to a memory location containing a cache line of data.
2231/// \param sel
2232/// A predefined integer constant specifying the type of prefetch
2233/// operation: \n
2234/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
2235/// PREFETCHNTA instruction will be generated. \n
2236/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
2237/// be generated. \n
2238/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
2239/// be generated. \n
2240/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
2241/// be generated.
#define _mm_prefetch(a, sel)                                                   \
  (__builtin_prefetch((const void *)(a), (((sel) >> 2) & 1), ((sel) & 0x3)))
2244#endif
2245
2246/// Stores a 64-bit integer in the specified aligned memory location. To
2247/// minimize caching, the data is flagged as non-temporal (unlikely to be
2248/// used again soon).
2249///
2250/// \headerfile <x86intrin.h>
2251///
2252/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2253///
2254/// \param __p
2255/// A pointer to an aligned memory location used to store the register value.
2256/// \param __a
2257/// A 64-bit integer containing the value to be stored.
2258static __inline__ void __DEFAULT_FN_ATTRS
2259_mm_stream_pi(void *__p, __m64 __a)
2260{
2261 __builtin_nontemporal_store(__a, (__m64 *)__p);
2262}
2263
2264/// Moves packed float values from a 128-bit vector of [4 x float] to a
2265/// 128-bit aligned memory location. To minimize caching, the data is flagged
2266/// as non-temporal (unlikely to be used again soon).
2267///
2268/// \headerfile <x86intrin.h>
2269///
2270/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2271///
2272/// \param __p
2273/// A pointer to a 128-bit aligned memory location that will receive the
2274/// single-precision floating-point values.
2275/// \param __a
2276/// A 128-bit vector of [4 x float] containing the values to be moved.
2277static __inline__ void __DEFAULT_FN_ATTRS
2278_mm_stream_ps(void *__p, __m128 __a)
2279{
2280 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2281}
2282
#ifdef __cplusplus
extern "C" {
#endif

/// Orders store instructions: every store issued before this call is
///    completed before any store issued after it is executed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2301
2302/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
2303/// returns it, as specified by the immediate integer operand.
2304///
2305/// \headerfile <x86intrin.h>
2306///
2307/// \code
2308/// int _mm_extract_pi16(__m64 a, int n);
2309/// \endcode
2310///
2311/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
2312///
2313/// \param a
2314/// A 64-bit vector of [4 x i16].
2315/// \param n
2316/// An immediate integer operand that determines which bits are extracted: \n
2317/// 0: Bits [15:0] are copied to the destination. \n
2318/// 1: Bits [31:16] are copied to the destination. \n
2319/// 2: Bits [47:32] are copied to the destination. \n
2320/// 3: Bits [63:48] are copied to the destination.
2321/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Arguments are parenthesised so the casts bind to the whole argument
   expression (e.g. a conditional or an arithmetic expression). */
#define _mm_extract_pi16(a, n) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2324
2325/// Copies data from the 64-bit vector of [4 x i16] to the destination,
2326/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
2327/// specified by the immediate operand \a n.
2328///
2329/// \headerfile <x86intrin.h>
2330///
2331/// \code
2332/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
2333/// \endcode
2334///
2335/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
2336///
2337/// \param a
2338/// A 64-bit vector of [4 x i16].
2339/// \param d
2340/// An integer. The lower 16-bit value from this operand is written to the
2341/// destination at the offset specified by operand \a n.
2342/// \param n
2343/// An immediate integer operand that determines which bits of the
2344/// destination are overwritten. \n
2345/// 0: Bits [15:0] are copied to the destination. \n
2346/// 1: Bits [31:16] are copied to the destination. \n
2347/// 2: Bits [47:32] are copied to the destination. \n
2348/// 3: Bits [63:48] are copied to the destination. \n
2349/// The remaining bits in the destination are copied from the corresponding
2350/// bits in operand \a a.
2351/// \returns A 64-bit integer vector containing the copied packed data from the
2352/// operands.
/* Arguments are parenthesised so the casts bind to the whole argument
   expression (e.g. a conditional or an arithmetic expression). */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2355
2356/// Compares each of the corresponding packed 16-bit integer values of
2357/// the 64-bit integer vectors, and writes the greater value to the
2358/// corresponding bits in the destination.
2359///
2360/// \headerfile <x86intrin.h>
2361///
2362/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2363///
2364/// \param __a
2365/// A 64-bit integer vector containing one of the source operands.
2366/// \param __b
2367/// A 64-bit integer vector containing one of the source operands.
2368/// \returns A 64-bit integer vector containing the comparison results.
2369static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2370_mm_max_pi16(__m64 __a, __m64 __b)
2371{
2372 return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
2373}
2374
2375/// Compares each of the corresponding packed 8-bit unsigned integer
2376/// values of the 64-bit integer vectors, and writes the greater value to the
2377/// corresponding bits in the destination.
2378///
2379/// \headerfile <x86intrin.h>
2380///
2381/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2382///
2383/// \param __a
2384/// A 64-bit integer vector containing one of the source operands.
2385/// \param __b
2386/// A 64-bit integer vector containing one of the source operands.
2387/// \returns A 64-bit integer vector containing the comparison results.
2388static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2389_mm_max_pu8(__m64 __a, __m64 __b)
2390{
2391 return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
2392}
2393
2394/// Compares each of the corresponding packed 16-bit integer values of
2395/// the 64-bit integer vectors, and writes the lesser value to the
2396/// corresponding bits in the destination.
2397///
2398/// \headerfile <x86intrin.h>
2399///
2400/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2401///
2402/// \param __a
2403/// A 64-bit integer vector containing one of the source operands.
2404/// \param __b
2405/// A 64-bit integer vector containing one of the source operands.
2406/// \returns A 64-bit integer vector containing the comparison results.
2407static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2408_mm_min_pi16(__m64 __a, __m64 __b)
2409{
2410 return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
2411}
2412
2413/// Compares each of the corresponding packed 8-bit unsigned integer
2414/// values of the 64-bit integer vectors, and writes the lesser value to the
2415/// corresponding bits in the destination.
2416///
2417/// \headerfile <x86intrin.h>
2418///
2419/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2420///
2421/// \param __a
2422/// A 64-bit integer vector containing one of the source operands.
2423/// \param __b
2424/// A 64-bit integer vector containing one of the source operands.
2425/// \returns A 64-bit integer vector containing the comparison results.
2426static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2427_mm_min_pu8(__m64 __a, __m64 __b)
2428{
2429 return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
2430}
2431
2432/// Takes the most significant bit from each 8-bit element in a 64-bit
2433/// integer vector to create an 8-bit mask value. Zero-extends the value to
2434/// 32-bit integer and writes it to the destination.
2435///
2436/// \headerfile <x86intrin.h>
2437///
2438/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2439///
2440/// \param __a
2441/// A 64-bit integer vector containing the values with bits to be extracted.
2442/// \returns The most significant bit from each 8-bit element in \a __a,
2443/// written to bits [7:0].
2444static __inline__ int __DEFAULT_FN_ATTRS_SSE2
2446{
2447 return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
2448}
2449
2450/// Multiplies packed 16-bit unsigned integer values and writes the
2451/// high-order 16 bits of each 32-bit product to the corresponding bits in
2452/// the destination.
2453///
2454/// \headerfile <x86intrin.h>
2455///
2456/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2457///
2458/// \param __a
2459/// A 64-bit integer vector containing one of the source operands.
2460/// \param __b
2461/// A 64-bit integer vector containing one of the source operands.
2462/// \returns A 64-bit integer vector containing the products of both operands.
2463static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2465{
2466 return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
2467 (__v8hi)__anyext128(__b)));
2468}
2469
2470/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
2471/// destination, as specified by the immediate value operand.
2472///
2473/// \headerfile <x86intrin.h>
2474///
2475/// \code
2476/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
2477/// \endcode
2478///
2479/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
2480///
2481/// \param a
2482/// A 64-bit integer vector containing the values to be shuffled.
2483/// \param n
2484/// An immediate value containing an 8-bit value specifying which elements to
2485/// copy from \a a. The destinations within the 64-bit destination are
2486/// assigned values as follows: \n
2487/// Bits [1:0] are used to assign values to bits [15:0] in the
2488/// destination. \n
2489/// Bits [3:2] are used to assign values to bits [31:16] in the
2490/// destination. \n
2491/// Bits [5:4] are used to assign values to bits [47:32] in the
2492/// destination. \n
2493/// Bits [7:6] are used to assign values to bits [63:48] in the
2494/// destination. \n
2495/// Bit value assignments: \n
2496/// 00: assigned from bits [15:0] of \a a. \n
2497/// 01: assigned from bits [31:16] of \a a. \n
2498/// 10: assigned from bits [47:32] of \a a. \n
2499/// 11: assigned from bits [63:48] of \a a. \n
2500/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2501/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2502/// <c>[b6, b4, b2, b0]</c>.
2503/// \returns A 64-bit integer vector containing the shuffled values.
/* Each destination lane i takes the source lane named by the 2-bit field of
 * n at bit position 2*i; the second shuffle operand is never selected. */
#define _mm_shuffle_pi16(a, n)                                                 \
  ((__m64)__builtin_shufflevector(                                             \
      (__v4hi)(__m64)(a), __extension__(__v4hi){},                             \
      ((n) >> 0) & 0x3,                                                        \
      ((n) >> 2) & 0x3,                                                        \
      ((n) >> 4) & 0x3,                                                        \
      ((n) >> 6) & 0x3))
2508
2509/// Conditionally copies the values from each 8-bit element in the first
2510/// 64-bit integer vector operand to the specified memory location, as
2511/// specified by the most significant bit in the corresponding element in the
2512/// second 64-bit integer vector operand.
2513///
2514/// To minimize caching, the data is flagged as non-temporal
2515/// (unlikely to be used again soon).
2516///
2517/// \headerfile <x86intrin.h>
2518///
2519/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2520///
2521/// \param __d
2522/// A 64-bit integer vector containing the values with elements to be copied.
2523/// \param __n
2524/// A 64-bit integer vector operand. The most significant bit from each 8-bit
2525/// element determines whether the corresponding element in operand \a __d
2526/// is copied. If the most significant bit of a given element is 1, the
2527/// corresponding element in operand \a __d is copied.
2528/// \param __p
2529/// A pointer to a 64-bit memory location that will receive the conditionally
2530/// copied integer values. The address of the memory location does not have
2531/// to be aligned.
2532static __inline__ void __DEFAULT_FN_ATTRS_SSE2
2533_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2534{
2535 // This is complex, because we need to support the case where __p is pointing
2536 // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
2537 // write might cause a trap where a 64-bit maskmovq would not. (Memory
2538 // locations not selected by the mask bits might still cause traps.)
2539 __m128i __d128 = __anyext128(__d);
2540 __m128i __n128 = __zext128(__n);
2541 if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
2542 ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
2543 // If there's a risk of spurious trap due to a 128-bit write, back up the
2544 // pointer by 8 bytes and shift values in registers to match.
2545 __p -= 8;
2546 __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
2547 __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
2548 }
2549
2550 __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
2551}
2552
2553/// Computes the rounded averages of the packed unsigned 8-bit integer
2554/// values and writes the averages to the corresponding bits in the
2555/// destination.
2556///
2557/// \headerfile <x86intrin.h>
2558///
2559/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2560///
2561/// \param __a
2562/// A 64-bit integer vector containing one of the source operands.
2563/// \param __b
2564/// A 64-bit integer vector containing one of the source operands.
2565/// \returns A 64-bit integer vector containing the averages of both operands.
2566static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2567_mm_avg_pu8(__m64 __a, __m64 __b)
2568{
2569 return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
2570 (__v16qi)__anyext128(__b)));
2571}
2572
2573/// Computes the rounded averages of the packed unsigned 16-bit integer
2574/// values and writes the averages to the corresponding bits in the
2575/// destination.
2576///
2577/// \headerfile <x86intrin.h>
2578///
2579/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2580///
2581/// \param __a
2582/// A 64-bit integer vector containing one of the source operands.
2583/// \param __b
2584/// A 64-bit integer vector containing one of the source operands.
2585/// \returns A 64-bit integer vector containing the averages of both operands.
2586static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2587_mm_avg_pu16(__m64 __a, __m64 __b)
2588{
2589 return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
2590 (__v8hi)__anyext128(__b)));
2591}
2592
/// Subtracts the corresponding 8-bit unsigned integer values of the two
/// 64-bit vector operands and computes the absolute value of each
/// difference. The sum of the 8 absolute differences is then written to
/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2597///
2598/// \headerfile <x86intrin.h>
2599///
2600/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2601///
2602/// \param __a
2603/// A 64-bit integer vector containing one of the source operands.
2604/// \param __b
2605/// A 64-bit integer vector containing one of the source operands.
2606/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2607/// sets of absolute differences between both operands. The upper bits are
2608/// cleared.
2609static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2610_mm_sad_pu8(__m64 __a, __m64 __b)
2611{
2612 return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
2613 (__v16qi)__zext128(__b)));
2614}
2615
2616#if defined(__cplusplus)
2617extern "C" {
2618#endif
2619
2620/// Returns the contents of the MXCSR register as a 32-bit unsigned
2621/// integer value.
2622///
2623/// There are several groups of macros associated with this
2624/// intrinsic, including:
2625/// <ul>
2626/// <li>
2627/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2628/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2629/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2630/// _MM_GET_EXCEPTION_STATE().
2631/// </li>
2632/// <li>
2633/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2634/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2635/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2636/// </li>
2637/// <li>
2638/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2639/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2640/// _MM_GET_ROUNDING_MODE().
2641/// </li>
2642/// <li>
2643/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2644/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2645/// </li>
2646/// <li>
2647/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2648/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2649/// _MM_GET_DENORMALS_ZERO_MODE().
2650/// </li>
2651/// </ul>
2652///
2653/// For example, the following expression checks if an overflow exception has
2654/// occurred:
2655/// \code
2656/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2657/// \endcode
2658///
2659/// The following expression gets the current rounding mode:
2660/// \code
2661/// _MM_GET_ROUNDING_MODE()
2662/// \endcode
2663///
2664/// \headerfile <x86intrin.h>
2665///
2666/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2667///
2668/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2669/// register.
2670unsigned int _mm_getcsr(void);
2671
2672/// Sets the MXCSR register with the 32-bit unsigned integer value.
2673///
2674/// There are several groups of macros associated with this intrinsic,
2675/// including:
2676/// <ul>
2677/// <li>
2678/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2679/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2680/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2681/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2682/// </li>
2683/// <li>
2684/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2685/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2686/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2687/// of these macros.
2688/// </li>
2689/// <li>
2690/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2691/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2692/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2693/// </li>
2694/// <li>
2695/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2696/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2697/// one of these macros.
2698/// </li>
2699/// <li>
2700/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2701/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2702/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2703/// </li>
2704/// </ul>
2705///
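/// For example, the following expression causes subsequent floating-point
/// operations to round up (the previous rounding-mode bits are cleared first,
/// because OR-ing alone would turn _MM_ROUND_DOWN into _MM_ROUND_TOWARD_ZERO):
/// \code
/// _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
/// \endcode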
2709///
2710/// The following example sets the DAZ and FTZ flags:
2711/// \code
2712/// void setFlags() {
2713/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2714/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2715/// }
2716/// \endcode
2717///
2718/// \headerfile <x86intrin.h>
2719///
2720/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2721///
2722/// \param __i
2723/// A 32-bit unsigned integer value to be written to the MXCSR register.
2724void _mm_setcsr(unsigned int __i);
2725
2726#if defined(__cplusplus)
2727} // extern "C"
2728#endif
2729
2730/// Selects 4 float values from the 128-bit operands of [4 x float], as
2731/// specified by the immediate value operand.
2732///
2733/// \headerfile <x86intrin.h>
2734///
2735/// \code
2736/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
2737/// \endcode
2738///
2739/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
2740///
2741/// \param a
2742/// A 128-bit vector of [4 x float].
2743/// \param b
2744/// A 128-bit vector of [4 x float].
2745/// \param mask
2746/// An immediate value containing an 8-bit value specifying which elements to
2747/// copy from \a a and \a b. \n
2748/// Bits [3:0] specify the values copied from operand \a a. \n
2749/// Bits [7:4] specify the values copied from operand \a b. \n
2750/// The destinations within the 128-bit destination are assigned values as
2751/// follows: \n
2752/// Bits [1:0] are used to assign values to bits [31:0] in the
2753/// destination. \n
2754/// Bits [3:2] are used to assign values to bits [63:32] in the
2755/// destination. \n
2756/// Bits [5:4] are used to assign values to bits [95:64] in the
2757/// destination. \n
2758/// Bits [7:6] are used to assign values to bits [127:96] in the
2759/// destination. \n
2760/// Bit value assignments: \n
2761/// 00: Bits [31:0] copied from the specified operand. \n
2762/// 01: Bits [63:32] copied from the specified operand. \n
2763/// 10: Bits [95:64] copied from the specified operand. \n
2764/// 11: Bits [127:96] copied from the specified operand. \n
2765/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2766/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2767/// <c>[b6, b4, b2, b0]</c>.
2768/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
/* Thin wrapper over the SHUFPS builtin: both vectors are viewed as
 * [4 x float] and the mask is passed through as an int immediate. */
#define _mm_shuffle_ps(a, b, mask)                                             \
  ((__m128)__builtin_ia32_shufps(                                              \
      (__v4sf)(__m128)(a),                                                     \
      (__v4sf)(__m128)(b),                                                     \
      (int)(mask)))
2772
2773/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2774/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2775///
2776/// \headerfile <x86intrin.h>
2777///
2778/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2779///
2780/// \param __a
2781/// A 128-bit vector of [4 x float]. \n
2782/// Bits [95:64] are written to bits [31:0] of the destination. \n
2783/// Bits [127:96] are written to bits [95:64] of the destination.
2784/// \param __b
/// A 128-bit vector of [4 x float]. \n
2786/// Bits [95:64] are written to bits [63:32] of the destination. \n
2787/// Bits [127:96] are written to bits [127:96] of the destination.
2788/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2789static __inline__ __m128 __DEFAULT_FN_ATTRS
2790_mm_unpackhi_ps(__m128 __a, __m128 __b)
2791{
2792 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2793}
2794
2795/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2796/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2797///
2798/// \headerfile <x86intrin.h>
2799///
2800/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2801///
2802/// \param __a
2803/// A 128-bit vector of [4 x float]. \n
2804/// Bits [31:0] are written to bits [31:0] of the destination. \n
2805/// Bits [63:32] are written to bits [95:64] of the destination.
2806/// \param __b
2807/// A 128-bit vector of [4 x float]. \n
2808/// Bits [31:0] are written to bits [63:32] of the destination. \n
2809/// Bits [63:32] are written to bits [127:96] of the destination.
2810/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2811static __inline__ __m128 __DEFAULT_FN_ATTRS
2812_mm_unpacklo_ps(__m128 __a, __m128 __b)
2813{
2814 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2815}
2816
2817/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2818/// 32 bits are set to the lower 32 bits of the second parameter. The upper
2819/// 96 bits are set to the upper 96 bits of the first parameter.
2820///
2821/// \headerfile <x86intrin.h>
2822///
2823/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2824/// instruction.
2825///
2826/// \param __a
2827/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2828/// written to the upper 96 bits of the result.
2829/// \param __b
2830/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2831/// written to the lower 32 bits of the result.
2832/// \returns A 128-bit floating-point vector of [4 x float].
2833static __inline__ __m128 __DEFAULT_FN_ATTRS
2834_mm_move_ss(__m128 __a, __m128 __b)
2835{
2836 __a[0] = __b[0];
2837 return __a;
2838}
2839
2840/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2841/// 64 bits are set to the upper 64 bits of the second parameter. The upper
2842/// 64 bits are set to the upper 64 bits of the first parameter.
2843///
2844/// \headerfile <x86intrin.h>
2845///
2846/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2847///
2848/// \param __a
2849/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2850/// written to the upper 64 bits of the result.
2851/// \param __b
2852/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2853/// written to the lower 64 bits of the result.
2854/// \returns A 128-bit floating-point vector of [4 x float].
2855static __inline__ __m128 __DEFAULT_FN_ATTRS
2856_mm_movehl_ps(__m128 __a, __m128 __b)
2857{
2858 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2859}
2860
2861/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2862/// 64 bits are set to the lower 64 bits of the first parameter. The upper
2863/// 64 bits are set to the lower 64 bits of the second parameter.
2864///
2865/// \headerfile <x86intrin.h>
2866///
2867/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2868///
2869/// \param __a
2870/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2871/// written to the lower 64 bits of the result.
2872/// \param __b
2873/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2874/// written to the upper 64 bits of the result.
2875/// \returns A 128-bit floating-point vector of [4 x float].
2876static __inline__ __m128 __DEFAULT_FN_ATTRS
2877_mm_movelh_ps(__m128 __a, __m128 __b)
2878{
2879 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2880}
2881
2882/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2883/// float].
2884///
2885/// \headerfile <x86intrin.h>
2886///
2887/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2888///
2889/// \param __a
2890/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2891/// from the corresponding elements in this operand.
2892/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2893/// values from the operand.
2894static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2896{
2897 return __builtin_convertvector((__v4hi)__a, __v4sf);
2898}
2899
2900/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2901/// 128-bit vector of [4 x float].
2902///
2903/// \headerfile <x86intrin.h>
2904///
2905/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2906///
2907/// \param __a
2908/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2909/// destination are copied from the corresponding elements in this operand.
2910/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2911/// values from the operand.
2912static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2914{
2915 return __builtin_convertvector((__v4hu)__a, __v4sf);
2916}
2917
2918/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2919/// into a 128-bit vector of [4 x float].
2920///
2921/// \headerfile <x86intrin.h>
2922///
2923/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2924///
2925/// \param __a
2926/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2927/// from the corresponding lower 4 elements in this operand.
2928/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2929/// values from the operand.
2930static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2932{
2933 return __builtin_convertvector(
2934 __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
2935 0, 1, 2, 3), __v4sf);
2936}
2937
2938/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2939/// vector of [8 x u8] into a 128-bit vector of [4 x float].
2940///
2941/// \headerfile <x86intrin.h>
2942///
2943/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2944///
2945/// \param __a
2946/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2947/// destination are copied from the corresponding lower 4 elements in this
2948/// operand.
2949/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2950/// values from the source operand.
2951static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2953{
2954 return __builtin_convertvector(
2955 __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
2956 0, 1, 2, 3), __v4sf);
2957}
2958
2959/// Converts the two 32-bit signed integer values from each 64-bit vector
2960/// operand of [2 x i32] into a 128-bit vector of [4 x float].
2961///
2962/// \headerfile <x86intrin.h>
2963///
2964/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2965///
2966/// \param __a
2967/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2968/// copied from the elements in this operand.
2969/// \param __b
2970/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2971/// copied from the elements in this operand.
2972/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2973/// copied and converted values from the first operand. The upper 64 bits
2974/// contain the copied and converted values from the second operand.
2975static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
2977{
2978 return __builtin_convertvector(
2979 __builtin_shufflevector((__v2si)__a, (__v2si)__b,
2980 0, 1, 2, 3), __v4sf);
2981}
2982
2983/// Converts each single-precision floating-point element of a 128-bit
2984/// floating-point vector of [4 x float] into a 16-bit signed integer, and
2985/// packs the results into a 64-bit integer vector of [4 x i16].
2986///
2987/// If the floating-point element is NaN or infinity, or if the
2988/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2989/// it is converted to 0x8000. Otherwise if the floating-point element is
2990/// greater than 0x7FFF, it is converted to 0x7FFF.
2991///
2992/// \headerfile <x86intrin.h>
2993///
2994/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2995///
2996/// \param __a
2997/// A 128-bit floating-point vector of [4 x float].
2998/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2999/// values.
3000static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
3002{
3003 return __trunc64(__builtin_ia32_packssdw128(
3004 (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
3005}
3006
3007/// Converts each single-precision floating-point element of a 128-bit
3008/// floating-point vector of [4 x float] into an 8-bit signed integer, and
3009/// packs the results into the lower 32 bits of a 64-bit integer vector of
3010/// [8 x i8]. The upper 32 bits of the vector are set to 0.
3011///
3012/// If the floating-point element is NaN or infinity, or if the
3013/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
3014/// is converted to 0x80. Otherwise if the floating-point element is greater
3015/// than 0x7F, it is converted to 0x7F.
3016///
3017/// \headerfile <x86intrin.h>
3018///
3019/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
3020///
3021/// \param __a
3022/// 128-bit floating-point vector of [4 x float].
3023/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
/// converted values and the upper 32 bits are set to zero.
3025static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
3027{
3028 __m64 __b, __c;
3029
3032
3033 return _mm_packs_pi16(__b, __c);
3034}
3035
/// Extracts the sign bits from each single-precision floating-point
/// element of a 128-bit floating-point vector of [4 x float] and returns the
/// sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
/// to zero.
3040///
3041/// \headerfile <x86intrin.h>
3042///
3043/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3044///
3045/// \param __a
3046/// A 128-bit floating-point vector of [4 x float].
3047/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3048/// single-precision floating-point element of the parameter. Bits [31:4] are
3049/// set to zero.
3050static __inline__ int __DEFAULT_FN_ATTRS
3052{
3053 return __builtin_ia32_movmskps((__v4sf)__a);
3054}
3055
3056/* Compare */
3057#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
3058#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
3059#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
3060#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
3061#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
3062#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
3063#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
3064#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
3065
3066/// Compares each of the corresponding values of two 128-bit vectors of
3067/// [4 x float], using the operation specified by the immediate integer
3068/// operand.
3069///
3070/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3071/// If either value in a comparison is NaN, comparisons that are ordered
3072/// return false, and comparisons that are unordered return true.
3073///
3074/// \headerfile <x86intrin.h>
3075///
3076/// \code
3077/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
3078/// \endcode
3079///
3080/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
3081///
3082/// \param a
3083/// A 128-bit vector of [4 x float].
3084/// \param b
3085/// A 128-bit vector of [4 x float].
3086/// \param c
3087/// An immediate integer operand, with bits [4:0] specifying which comparison
3088/// operation to use: \n
3089/// 0x00: Equal (ordered, non-signaling) \n
3090/// 0x01: Less-than (ordered, signaling) \n
3091/// 0x02: Less-than-or-equal (ordered, signaling) \n
3092/// 0x03: Unordered (non-signaling) \n
3093/// 0x04: Not-equal (unordered, non-signaling) \n
3094/// 0x05: Not-less-than (unordered, signaling) \n
3095/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
3096/// 0x07: Ordered (non-signaling) \n
3097/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/* Packed compare: forwards both vectors and the predicate immediate to the
 * CMPPS builtin. */
#define _mm_cmp_ps(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpps(                                               \
      (__v4sf)(__m128)(a),                                                     \
      (__v4sf)(__m128)(b),                                                     \
      (c)))
3100
3101/// Compares each of the corresponding scalar values of two 128-bit
3102/// vectors of [4 x float], using the operation specified by the immediate
3103/// integer operand.
3104///
3105/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3106/// If either value in a comparison is NaN, comparisons that are ordered
3107/// return false, and comparisons that are unordered return true.
3108///
3109/// \headerfile <x86intrin.h>
3110///
3111/// \code
3112/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
3113/// \endcode
3114///
3115/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
3116///
3117/// \param a
3118/// A 128-bit vector of [4 x float].
3119/// \param b
3120/// A 128-bit vector of [4 x float].
3121/// \param c
3122/// An immediate integer operand, with bits [4:0] specifying which comparison
3123/// operation to use: \n
3124/// 0x00: Equal (ordered, non-signaling) \n
3125/// 0x01: Less-than (ordered, signaling) \n
3126/// 0x02: Less-than-or-equal (ordered, signaling) \n
3127/// 0x03: Unordered (non-signaling) \n
3128/// 0x04: Not-equal (unordered, non-signaling) \n
3129/// 0x05: Not-less-than (unordered, signaling) \n
3130/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
3131/// 0x07: Ordered (non-signaling) \n
3132/// \returns A 128-bit vector of [4 x float] containing the comparison results.
/* Scalar compare: forwards both vectors and the predicate immediate to the
 * CMPSS builtin. */
#define _mm_cmp_ss(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpss(                                               \
      (__v4sf)(__m128)(a),                                                     \
      (__v4sf)(__m128)(b),                                                     \
      (c)))
3135
3136#define _MM_ALIGN16 __attribute__((aligned(16)))
3137
3138#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
3139
3140#define _MM_EXCEPT_INVALID (0x0001U)
3141#define _MM_EXCEPT_DENORM (0x0002U)
3142#define _MM_EXCEPT_DIV_ZERO (0x0004U)
3143#define _MM_EXCEPT_OVERFLOW (0x0008U)
3144#define _MM_EXCEPT_UNDERFLOW (0x0010U)
3145#define _MM_EXCEPT_INEXACT (0x0020U)
3146#define _MM_EXCEPT_MASK (0x003fU)
3147
3148#define _MM_MASK_INVALID (0x0080U)
3149#define _MM_MASK_DENORM (0x0100U)
3150#define _MM_MASK_DIV_ZERO (0x0200U)
3151#define _MM_MASK_OVERFLOW (0x0400U)
3152#define _MM_MASK_UNDERFLOW (0x0800U)
3153#define _MM_MASK_INEXACT (0x1000U)
3154#define _MM_MASK_MASK (0x1f80U)
3155
3156#define _MM_ROUND_NEAREST (0x0000U)
3157#define _MM_ROUND_DOWN (0x2000U)
3158#define _MM_ROUND_UP (0x4000U)
3159#define _MM_ROUND_TOWARD_ZERO (0x6000U)
3160#define _MM_ROUND_MASK (0x6000U)
3161
3162#define _MM_FLUSH_ZERO_MASK (0x8000U)
3163#define _MM_FLUSH_ZERO_ON (0x8000U)
3164#define _MM_FLUSH_ZERO_OFF (0x0000U)
3165
3166#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
3167#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
3168#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
3169#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
3170
3171#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
3172#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
3173#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
3174#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
3175
/* Transposes, in place, the 4x4 float matrix whose rows are the [4 x float]
 * lvalues row0..row3. Each row argument is evaluated more than once, so pass
 * plain variables. The scratch vectors use reserved double-underscore names so
 * they cannot capture a caller variable that happens to be named tmp0..tmp3. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
3188
3189/* Aliases for compatibility. */
3190#define _m_pextrw _mm_extract_pi16
3191#define _m_pinsrw _mm_insert_pi16
3192#define _m_pmaxsw _mm_max_pi16
3193#define _m_pmaxub _mm_max_pu8
3194#define _m_pminsw _mm_min_pi16
3195#define _m_pminub _mm_min_pu8
3196#define _m_pmovmskb _mm_movemask_pi8
3197#define _m_pmulhuw _mm_mulhi_pu16
3198#define _m_pshufw _mm_shuffle_pi16
3199#define _m_maskmovq _mm_maskmove_si64
3200#define _m_pavgb _mm_avg_pu8
3201#define _m_pavgw _mm_avg_pu16
3202#define _m_psadbw _mm_sad_pu8
3203#define _m_ _mm_
3204
3205#undef __trunc64
3206#undef __zext128
3207#undef __anyext128
3208#undef __zeroupper64
3209#undef __DEFAULT_FN_ATTRS
3210#undef __DEFAULT_FN_ATTRS_SSE2
3211
3212/* Ugly hack for backwards-compatibility (compatible with gcc) */
3213#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3214#include <emmintrin.h>
3215#endif
3216
3217#endif /* __XMMINTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:88
static __inline__ uint32_t uint32_t __y
Definition: arm_acle.h:130
static __inline__ void int __a
Definition: emmintrin.h:4064
struct __storeu_i16 *__P __v
Definition: immintrin.h:472
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1, __m64 __m2)
Converts, with saturation, 16-bit signed integers from both 64-bit integer vector parameters of [4 x ...
Definition: mmintrin.h:157
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition: mmintrin.h:1336
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1193
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_si32(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
Definition: xmmintrin.h:1515
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a)
Calculates the approximate reciprocal of the value stored in the low-order bits of a 128-bit vector o...
Definition: xmmintrin.h:278
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition: xmmintrin.h:2427
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:594
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu16(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 16-bit integer values and writes the averages to...
Definition: xmmintrin.h:2587
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a)
Calculates the square root of the value stored in the low-order bits of a 128-bit vector of [4 x floa...
Definition: xmmintrin.h:243
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ps(__m128 __a, __m128 __b)
Divides two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:225
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:969
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for equality.
Definition: xmmintrin.h:546
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:2042
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ss(float __w)
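Constructs a 128-bit floating-point vector of [4 x float] whose lower 32 bits hold the specified single-precision value and whose upper 96 bits are set to zero.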
Definition: xmmintrin.h:1935
#define __anyext128(x)
Definition: xmmintrin.h:56
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_and_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:441
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is less than the corresponding value in the second operand.
Definition: xmmintrin.h:572
static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_movemask_pi8(__m64 __a)
Takes the most significant bit from each 8-bit element in a 64-bit integer vector to create an 8-bit mask value, which is zero-extended to a 32-bit integer and returned.
Definition: xmmintrin.h:2445
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i32].
Definition: xmmintrin.h:1493
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit integer.
Definition: xmmintrin.h:1427
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition: xmmintrin.h:525
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ps(__m128 __a, __m128 __b)
Adds two 128-bit vectors of [4 x float], and returns the results of the addition.
Definition: xmmintrin.h:99
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an aligned memory location.
Definition: xmmintrin.h:1862
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for inequality.
Definition: xmmintrin.h:790
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is less than or equal to the second operand.
Definition: xmmintrin.h:1169
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_andnot_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float], using the one's complement of the values contained in the first source operand.
Definition: xmmintrin.h:463
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a)
Stores float values from a 128-bit vector of [4 x float] to an aligned memory location in reverse order.
Definition: xmmintrin.h:2201
#define __zeroupper64(x)
Definition: xmmintrin.h:59
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
Definition: xmmintrin.h:1915
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not less than or equal to the corresponding value in the second operand.
Definition: xmmintrin.h:867
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is less than the second operand.
Definition: xmmintrin.h:1288
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is less than or equal to the corresponding value in the second operand.
Definition: xmmintrin.h:620
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sad_pu8(__m64 __a, __m64 __b)
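Subtracts the corresponding 8-bit unsigned integer values of the two 64-bit vector operands and computes the absolute value of each difference; the sum of the eight absolute differences is written to bits [15:0] of the destination and the remaining bits are cleared.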
Definition: xmmintrin.h:2610
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvt_si2ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits of the destination; the remaining higher-order elements are copied from the first operand.
Definition: xmmintrin.h:1649
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu8_ps(__m64 __a)
Converts the lower four unsigned 8-bit integer values from a 64-bit vector of [8 x u8] into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2952
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ps(__m128 __a)
Calculates the approximate reciprocals of the square roots of the values stored in a 128-bit vector of [4 x float].
Definition: xmmintrin.h:331
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a)
Stores the lower 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition: xmmintrin.h:2080
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is less than or equal to the second operand.
Definition: xmmintrin.h:1312
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are greater than or equal to those in the second operand.
Definition: xmmintrin.h:742
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition: xmmintrin.h:1120
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an aligned memory location.
Definition: xmmintrin.h:2162
void _mm_sfence(void)
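Forces strong memory ordering (serialization) between store instructions preceding this instruction and store instructions following this instruction, ensuring the system completes all previous stores before executing subsequent stores.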
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps1(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision floating-point vector elements set to the specified single-precision floating-point value.
Definition: xmmintrin.h:1972
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ps(__m128 __a, __m128 __b)
Multiplies two 128-bit vectors of [4 x float] and returns the results of the multiplication.
Definition: xmmintrin.h:184
#define __DEFAULT_FN_ATTRS
Definition: xmmintrin.h:43
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the greater of each pair of values.
Definition: xmmintrin.h:423
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a)
Calculates the approximate reciprocal of the square root of the value stored in the low-order bits of a 128-bit vector of [4 x float].
Definition: xmmintrin.h:314
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is greater than or equal to the second operand.
Definition: xmmintrin.h:1360
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is less than the second operand.
Definition: xmmintrin.h:1145
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the low-order bits of a 128-bit vector of [4 x float]; the high-order bits are copied from the high-order bits of the first operand.
Definition: xmmintrin.h:1790
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition: xmmintrin.h:2122
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ss(__m128 __a, __m128 __b)
Subtracts the 32-bit float value in the low-order bits of the second operand from the corresponding value in the first operand.
Definition: xmmintrin.h:121
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ps(__m128 __a, __m128 __b)
Subtracts each of the values of the second operand from the first operand, both of which are 128-bit vectors of [4 x float], and returns the results of the subtraction.
Definition: xmmintrin.h:142
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p)
Loads a 32-bit float value and duplicates it to all four vector elements of a 128-bit vector of [4 x float].
Definition: xmmintrin.h:1839
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movelh_ps(__m128 __a, __m128 __b)
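Constructs a 128-bit floating-point vector of [4 x float] whose lower 64 bits are the lower 64 bits of the first operand and whose upper 64 bits are the lower 64 bits of the second operand.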
Definition: xmmintrin.h:2877
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the lesser of each pair of values.
Definition: xmmintrin.h:377
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(void *__p, __m128 __a)
Moves packed float values from a 128-bit vector of [4 x float] to a 128-bit aligned memory location.
Definition: xmmintrin.h:2278
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(void *__p, __m64 __a)
Stores a 64-bit integer in the specified aligned memory location.
Definition: xmmintrin.h:2259
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
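Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them to the lower 64 bits of the destination; the remaining higher-order elements are copied from the first operand.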
Definition: xmmintrin.h:1725
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is greater than or equal to the second operand.
Definition: xmmintrin.h:1217
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit integer.
Definition: xmmintrin.h:1405
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are greater than those in the second operand.
Definition: xmmintrin.h:692
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setr_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float], initialized in reverse order with the specified 32-bit single-precision floating-point values.
Definition: xmmintrin.h:2027
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is greater than the second operand.
Definition: xmmintrin.h:1336
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Converts the two 32-bit signed integer values from each 64-bit vector operand of [2 x i32] into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2976
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi16(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x float] into a 16-bit signed integer, and packs the results into a 64-bit integer vector of [4 x i16].
Definition: xmmintrin.h:3001
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpackhi_ps(__m128 __a, __m128 __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x float] and interleaves them into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2790
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ss(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] to a memory location.
Definition: xmmintrin.h:2101
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not greater than the corresponding value in the second operand.
Definition: xmmintrin.h:917
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the high-order bits of a 128-bit vector of [4 x float]; the low-order bits are copied from the low-order bits of the first operand.
Definition: xmmintrin.h:1763
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_xor_ps(__m128 __a, __m128 __b)
Performs a bitwise exclusive OR of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:500
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a)
Calculates the approximate reciprocals of the values stored in a 128-bit vector of [4 x float].
Definition: xmmintrin.h:295
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_move_ss(__m128 __a, __m128 __b)
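Constructs a 128-bit floating-point vector of [4 x float] whose lower 32 bits are the lower 32 bits of the second operand and whose upper 96 bits are the upper 96 bits of the first operand.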
Definition: xmmintrin.h:2834
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set1_ps(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision floating-point vector elements set to the specified single-precision floating-point value.
Definition: xmmintrin.h:1953
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors, and writes the lesser value to the corresponding bits in the destination.
Definition: xmmintrin.h:2408
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] into an aligned memory location.
Definition: xmmintrin.h:2143
void _mm_setcsr(unsigned int __i)
Sets the MXCSR register with the 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_or_ps(__m128 __a, __m128 __b)
Performs a bitwise OR of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:481
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a)
Calculates the square roots of the values stored in a 128-bit vector of [4 x float].
Definition: xmmintrin.h:260
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for inequality.
Definition: xmmintrin.h:768
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtss_f32(__m128 __a)
Extracts a float value contained in the lower 32 bits of a vector of [4 x float].
Definition: xmmintrin.h:1742
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
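Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them to the lower 64 bits of the destination; the remaining higher-order elements are copied from the first operand.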
Definition: xmmintrin.h:1699
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ss(__m128 __a, __m128 __b)
Multiplies two 32-bit float values in the low-order bits of the operands.
Definition: xmmintrin.h:164
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvttps_pi32(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated (rounded toward zero) 32-bit integers, returned in a 64-bit vector of [2 x i32].
Definition: xmmintrin.h:1583
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i32].
Definition: xmmintrin.h:1473
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsi32_ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits of the destination; the remaining higher-order elements are copied from the first operand.
Definition: xmmintrin.h:1626
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtt_ss2si(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward zero) 32-bit integer.
Definition: xmmintrin.h:1537
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_ps(__m128 __a)
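Extracts the sign bits from each single-precision floating-point element of a 128-bit floating-point vector of [4 x float] and returns them in the four low-order bits of the result; the remaining bits are set to zero.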
Definition: xmmintrin.h:3051
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehl_ps(__m128 __a, __m128 __b)
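Constructs a 128-bit floating-point vector of [4 x float] whose upper 64 bits are the upper 64 bits of the first operand and whose lower 64 bits are the upper 64 bits of the second operand.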
Definition: xmmintrin.h:2856
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi8(__m128 __a)
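Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x float] into an 8-bit signed integer, and packs the results into the lower 32 bits of a 64-bit integer vector of [8 x i8]; the upper 32 bits are set to zero.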
Definition: xmmintrin.h:3026
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors, and writes the greater value to the corresponding bits in the destination.
Definition: xmmintrin.h:2370
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p)
Loads four packed float values, in reverse order, from an aligned memory location to 32-bit elements in a 128-bit vector of [4 x float].
Definition: xmmintrin.h:1901
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is ordered with respect to the corresponding value in the second operand.
Definition: xmmintrin.h:1021
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu8(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 8-bit integer values and writes the averages to the corresponding bits in the destination.
Definition: xmmintrin.h:2567
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not less than those in the second operand.
Definition: xmmintrin.h:840
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a)
Stores the upper 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition: xmmintrin.h:2059
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not greater than those in the second operand.
Definition: xmmintrin.h:942
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not greater than or equal to those in the second operand.
Definition: xmmintrin.h:994
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are ordered with respect to those in the second operand.
Definition: xmmintrin.h:1045
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is greater than the corresponding value in the second operand.
Definition: xmmintrin.h:668
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi8_ps(__m64 __a)
Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2931
#define __trunc64(x)
Definition: xmmintrin.h:51
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine equality.
Definition: xmmintrin.h:1264
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ss(__m128 __a, __m128 __b)
Adds the 32-bit float values in the low-order bits of the operands.
Definition: xmmintrin.h:79
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float] initialized with the specified single-precision floating-point values.
Definition: xmmintrin.h:1999
static __inline__ void __DEFAULT_FN_ATTRS_SSE2 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
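Conditionally copies the values from each 8-bit element in the first 64-bit integer vector operand to the specified memory location, as selected by the most significant bit of the corresponding element in the second 64-bit integer vector operand.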
Definition: xmmintrin.h:2533
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not less than the corresponding value in the second operand.
Definition: xmmintrin.h:817
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an aligned memory location.
Definition: xmmintrin.h:2182
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the lesser value in the low-order bits of a 128-bit vector of [4 x float].
Definition: xmmintrin.h:356
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu16_ps(__m64 __a)
Converts a 64-bit vector of 16-bit unsigned integer values into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2913
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pu16(__m64 __a, __m64 __b)
Multiplies packed 16-bit unsigned integer values and writes the high-order 16 bits of each 32-bit product to the corresponding bits in the destination.
Definition: xmmintrin.h:2464
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are less than or equal to those in the second operand.
Definition: xmmintrin.h:642
unsigned int _mm_getcsr(void)
Returns the contents of the MXCSR register as a 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpacklo_ps(__m128 __a, __m128 __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x float] and interleaves them into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2812
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the greater value in the low-order bits of a 128-bit vector of [4 x float].
Definition: xmmintrin.h:402
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi16_ps(__m64 __a)
Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2895
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine inequality.
Definition: xmmintrin.h:1383
#define __zext128(x)
Definition: xmmintrin.h:53
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is not equal to the second operand.
Definition: xmmintrin.h:1241
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtt_ps2pi(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated (rounded toward zero) 32-bit integers, returned in a 64-bit vector of [2 x i32].
Definition: xmmintrin.h:1604
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not less than or equal to those in the second operand.
Definition: xmmintrin.h:890
#define __DEFAULT_FN_ATTRS_SSE2
Definition: xmmintrin.h:46
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is unordered with respect to the corresponding value in the second operand.
Definition: xmmintrin.h:1072
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is greater than or equal to the corresponding value in the second operand.
Definition: xmmintrin.h:718
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors, and writes the greater value to the corresponding bits in the destination.
Definition: xmmintrin.h:2389
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are unordered with respect to those in the second operand.
Definition: xmmintrin.h:1096
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition: xmmintrin.h:1879
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ss(const float *__p)
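Constructs a 128-bit floating-point vector of [4 x float] whose lower 32 bits hold the single-precision value loaded from the specified memory location and whose upper 96 bits are set to zero.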
Definition: xmmintrin.h:1817
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ss(__m128 __a, __m128 __b)
Divides the value in the low-order 32 bits of the first operand by the corresponding value in the second operand.
Definition: xmmintrin.h:206