/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics! */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

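/* Because _MM_FROUND_RAISE_EXC is 0, each composite macro reduces to its
   base rounding mode: _MM_FROUND_FLOOR is (_MM_FROUND_TO_NEG_INF | 0x00),
   i.e. 0x03.  OR-ing in _MM_FROUND_NO_EXC (0x08) additionally suppresses
   exception signaling; e.g. _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC is
   0x09. */
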
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode. */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions. */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}

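/* Illustrative use (a sketch; assumes _mm_set_pd from the companion
   emmintrin.h wrapper is in the include chain):

     __m128d __v = _mm_set_pd(2.5, -1.5); // elements {-1.5, 2.5}
     __m128d __f =
         _mm_round_pd(__v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
     // __f holds {-2.0, 2.0}, computed without signaling inexact.
*/
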
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode. */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions. */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

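/* Illustrative sketch (assumes _mm_set1_epi8 from the companion
   emmintrin.h wrapper); the selector is masked, so out-of-range __N wraps
   instead of being undefined:

     __m128i __x = _mm_set1_epi8(0);
     __x = _mm_insert_epi8(__x, 0x7f, 3); // byte 3 becomes 0x7f
*/
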
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

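/* Note that, as on x86, _mm_extract_ps returns the raw IEEE-754 bit
   pattern of the selected element, not a converted integer.  Sketch
   (assumes _mm_set1_ps from the companion xmmintrin.h wrapper):

     int __bits = _mm_extract_ps(_mm_set1_ps(1.0f), 0);
     // __bits == 0x3f800000, the encoding of 1.0f.
*/
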
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qu __charmask = vec_splats((unsigned char)__imm8);
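  /* Assumed semantics: vec_gb (vgbbd) transposes the bit matrix within
     each doubleword, so with every input byte equal to __imm8, result byte
     __i is 0xff when bit __i of __imm8 is set and 0x00 otherwise.
     vec_unpackh below then sign-extends those bytes into 0x0000/0xffff
     halfword masks for vec_sel. */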
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh((__v16qi)__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
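  /* vec_sra is an algebraic (arithmetic) shift, so shifting each byte
     right by 7 replicates its sign bit across the byte, matching blendv's
     select-on-bit-7 semantics. */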
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
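  /* __pcv[__imm8] is a vec_perm control vector: for each bit __i of
     __imm8, 32-bit word __i is taken from __B (byte indices 16 + 4*__i
     onward) when the bit is set, and from __A otherwise. */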
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
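  /* A word compares less than zero exactly when its sign bit (the blendv
     select bit) is set, so the comparison yields an all-ones/all-zeros
     mask per element. */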
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))

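/* Illustrative sketch (__v is a hypothetical vector): these return an int
   predicate rather than setting the x86 ZF/CF flags:

     __m128i __ones = _mm_cmpeq_epi32(__v, __v); // all bits set
     int __r = _mm_test_all_ones(__v);           // 1 iff __v == __ones
*/
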
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
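  /* Only the low 32 bits of each product are kept, so an unsigned vec_mul
     produces the same lanes as a signed multiply would. */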
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
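  /* Zero-extend by interleaving with zero bytes; the operand order depends
     on endianness so the zeros land in each element's high-order half. */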
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}

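/* Illustrative sketch: for {9, 3, 7, 3, 0xffff, 5, 8, 2} the minimum is 2
   at index 7, so the result holds 2 in bits [15:0], 7 in bits [18:16], and
   zeros elsewhere. */
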
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */