clang 20.0.0git
pmmintrin.h
Go to the documentation of this file.
1/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __PMMINTRIN_H
11#define __PMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <emmintrin.h>
18
19/* Define the default attributes for the functions in this file: __always_inline__, __nodebug__, an "sse3" __target__ and __min_vector_width__(128). The "no-evex512" target feature is appended only when __EVEX512__ is defined and __AVX10_1_512__ is not. The macro is removed again with #undef at the end of this header. */
20#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
21#define __DEFAULT_FN_ATTRS \
22 __attribute__((__always_inline__, __nodebug__, \
23 __target__("sse3,no-evex512"), __min_vector_width__(128)))
24#else
25#define __DEFAULT_FN_ATTRS \
26 __attribute__((__always_inline__, __nodebug__, __target__("sse3"), \
27 __min_vector_width__(128)))
28#endif
29
30/// Loads 128 bits of integer data from a possibly unaligned memory location
31/// into a 128-bit integer vector.
32///
33/// If the address of the data is not 16-byte aligned, the instruction may
34/// read two adjacent aligned blocks of memory to retrieve the requested
35/// data.
36///
37/// \headerfile <x86intrin.h>
38///
39/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
40///
41/// \param __p
42/// A pointer to the 128 bits of integer data to load. The address is not required to be 16-byte aligned.
43/// \returns A 128-bit integer vector containing the loaded values.
44static __inline__ __m128i __DEFAULT_FN_ATTRS
45_mm_lddqu_si128(__m128i_u const *__p)
46{
47 return (__m128i)__builtin_ia32_lddqu((char const *)__p); /* the pointer is handed to the builtin as a byte pointer */
48}
49
50/// Adds the even-indexed values and subtracts the odd-indexed values of
51/// two 128-bit vectors of [4 x float].
52///
53/// \headerfile <x86intrin.h>
54///
55/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
56///
57/// \param __a
58/// A 128-bit vector of [4 x float] containing the left source operand.
59/// \param __b
60/// A 128-bit vector of [4 x float] containing the right source operand.
61/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
62/// differences of the two operands.
63static __inline__ __m128 __DEFAULT_FN_ATTRS
64_mm_addsub_ps(__m128 __a, __m128 __b)
65{
66 return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b); /* NOTE(review): the ADDSUBPS instruction reference has the lowest (index 0) element subtracted and the next element added, which reads as the opposite of "adds the even-indexed values" in the header comment - confirm which indexing convention that comment intends. */
67}
68
69/// Horizontally adds the adjacent pairs of values contained in two
70/// 128-bit vectors of [4 x float].
71///
72/// \headerfile <x86intrin.h>
73///
74/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
75///
76/// \param __a
77/// A 128-bit vector of [4 x float] containing one of the source operands.
78/// The horizontal sums of the adjacent pairs in this operand are stored in
79/// the lower bits of the destination.
80/// \param __b
81/// A 128-bit vector of [4 x float] containing one of the source operands.
82/// The horizontal sums of the adjacent pairs in this operand are stored in
83/// the upper bits of the destination.
84/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
85/// both operands.
86static __inline__ __m128 __DEFAULT_FN_ATTRS
87_mm_hadd_ps(__m128 __a, __m128 __b)
88{
89 return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); /* operands are passed to the builtin in argument order */
90}
91
92/// Horizontally subtracts the adjacent pairs of values contained in two
93/// 128-bit vectors of [4 x float].
94///
95/// \headerfile <x86intrin.h>
96///
97/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
98///
99/// \param __a
100/// A 128-bit vector of [4 x float] containing one of the source operands.
101/// The horizontal differences of the adjacent pairs in this operand are
102/// stored in the lower bits of the destination.
103/// \param __b
104/// A 128-bit vector of [4 x float] containing one of the source operands.
105/// The horizontal differences of the adjacent pairs in this operand are
106/// stored in the upper bits of the destination.
107/// \returns A 128-bit vector of [4 x float] containing the horizontal
108/// differences of both operands.
109static __inline__ __m128 __DEFAULT_FN_ATTRS
110_mm_hsub_ps(__m128 __a, __m128 __b)
111{
112 return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); /* operands are passed to the builtin in argument order */
113}
114
115/// Moves and duplicates odd-indexed values from a 128-bit vector
116/// of [4 x float] to float values stored in a 128-bit vector of
117/// [4 x float].
118///
119/// \headerfile <x86intrin.h>
120///
121/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
122///
123/// \param __a
124/// A 128-bit vector of [4 x float]. \n
125/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
126/// the destination. \n
127/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
128/// destination.
129/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
130/// values.
131static __inline__ __m128 __DEFAULT_FN_ATTRS
133{
134 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
135}
136
137/// Duplicates even-indexed values from a 128-bit vector of
138/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
139///
140/// \headerfile <x86intrin.h>
141///
142/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
143///
144/// \param __a
145/// A 128-bit vector of [4 x float] \n
146/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
147/// the destination. \n
148/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
149/// destination.
150/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
151/// values.
152static __inline__ __m128 __DEFAULT_FN_ATTRS
154{
155 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
156}
157
158/// Adds the even-indexed values and subtracts the odd-indexed values of
159/// two 128-bit vectors of [2 x double].
160///
161/// \headerfile <x86intrin.h>
162///
163/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
164///
165/// \param __a
166/// A 128-bit vector of [2 x double] containing the left source operand.
167/// \param __b
168/// A 128-bit vector of [2 x double] containing the right source operand.
169/// \returns A 128-bit vector of [2 x double] containing the alternating sums
170/// and differences of the two operands.
171static __inline__ __m128d __DEFAULT_FN_ATTRS
172_mm_addsub_pd(__m128d __a, __m128d __b)
173{
174 return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); /* NOTE(review): the ADDSUBPD instruction reference has the lower (index 0) element subtracted and the upper element added, which reads as the opposite of "adds the even-indexed values" in the header comment - confirm which indexing convention that comment intends. */
175}
176
177/// Horizontally adds the pairs of values contained in two 128-bit
178/// vectors of [2 x double].
179///
180/// \headerfile <x86intrin.h>
181///
182/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
183///
184/// \param __a
185/// A 128-bit vector of [2 x double] containing one of the source operands.
186/// The horizontal sum of the two values in this operand is stored in the
187/// lower bits of the destination.
188/// \param __b
189/// A 128-bit vector of [2 x double] containing one of the source operands.
190/// The horizontal sum of the two values in this operand is stored in the
191/// upper bits of the destination.
192/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
193/// both operands.
194static __inline__ __m128d __DEFAULT_FN_ATTRS
195_mm_hadd_pd(__m128d __a, __m128d __b)
196{
197 return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); /* operands are passed to the builtin in argument order */
198}
199
200/// Horizontally subtracts the pairs of values contained in two 128-bit
201/// vectors of [2 x double].
202///
203/// \headerfile <x86intrin.h>
204///
205/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
206///
207/// \param __a
208/// A 128-bit vector of [2 x double] containing one of the source operands.
209/// The horizontal difference of the two values in this operand is stored
210/// in the lower bits of the destination.
211/// \param __b
212/// A 128-bit vector of [2 x double] containing one of the source operands.
213/// The horizontal difference of the two values in this operand is stored
214/// in the upper bits of the destination.
215/// \returns A 128-bit vector of [2 x double] containing the horizontal
216/// differences of both operands.
217static __inline__ __m128d __DEFAULT_FN_ATTRS
218_mm_hsub_pd(__m128d __a, __m128d __b)
219{
220 return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); /* operands are passed to the builtin in argument order */
221}
222
223/// Loads one double-precision value through a pointer and duplicates it
224/// into the double-precision values of a 128-bit vector of [2 x double].
225///
226/// \headerfile <x86intrin.h>
227///
228/// \code
229/// __m128d _mm_loaddup_pd(double const *dp);
230/// \endcode
231///
232/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
233///
234/// \param dp
235/// A pointer to a double-precision value to be loaded and duplicated.
236/// \returns A 128-bit vector of [2 x double] containing the loaded and
237/// duplicated values. Implemented as a macro that forwards \a dp to _mm_load1_pd.
238#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
239
240/// Moves and duplicates the double-precision value in the lower bits of
241/// a 128-bit vector of [2 x double] to double-precision values stored in a
242/// 128-bit vector of [2 x double].
243///
244/// \headerfile <x86intrin.h>
245///
246/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
247///
248/// \param __a
249/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
250/// [127:64] and [63:0] of the destination.
251/// \returns A 128-bit vector of [2 x double] containing the moved and
252/// duplicated values.
253static __inline__ __m128d __DEFAULT_FN_ATTRS
255{
256 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
257}
258
259/// Establishes a linear address memory range to be monitored and puts
260/// the processor in the monitor event pending state. Data stored in the
261/// monitored address range causes the processor to exit the pending state.
262///
263/// The \c MONITOR instruction can be used in kernel mode, and in other modes
264/// if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
265///
266/// \headerfile <x86intrin.h>
267///
268/// This intrinsic corresponds to the \c MONITOR instruction.
269///
270/// \param __p
271/// A pointer identifying the memory range to be monitored. The size of the
272/// range is determined by CPUID function 0000_0005h.
273/// \param __extensions
274/// Optional extensions for the monitoring state.
275/// \param __hints
276/// Optional hints for the monitoring state.
277static __inline__ void __DEFAULT_FN_ATTRS
278_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
279{
280 __builtin_ia32_monitor(__p, __extensions, __hints); /* all three arguments are forwarded to the builtin unchanged */
281}
282
283/// Used together with the \c MONITOR instruction: waits while the processor
284/// is in the monitor event pending state. Data stored in the monitored address
285/// range, or an interrupt, causes the processor to exit the pending state.
286///
287/// The \c MWAIT instruction can be used in kernel mode, and in other modes if
288/// MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
289///
290/// \headerfile <x86intrin.h>
291///
292/// This intrinsic corresponds to the \c MWAIT instruction.
293///
294/// \param __extensions
295/// Optional extensions for the monitoring state, which can vary by
296/// processor.
297/// \param __hints
298/// Optional hints for the monitoring state, which can vary by processor.
299static __inline__ void __DEFAULT_FN_ATTRS
300_mm_mwait(unsigned __extensions, unsigned __hints)
301{
302 __builtin_ia32_mwait(__extensions, __hints); /* both arguments are forwarded to the builtin unchanged */
303}
304
305#undef __DEFAULT_FN_ATTRS
306
307#endif /* __PMMINTRIN_H */
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:88
static __inline__ void int __a
Definition: emmintrin.h:4064
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_hadd_pd(__m128d __a, __m128d __b)
Horizontally adds the pairs of values contained in two 128-bit vectors of [2 x double].
Definition: pmmintrin.h:195
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_movedup_pd(__m128d __a)
Moves and duplicates the double-precision value in the lower bits of a 128-bit vector of [2 x double]...
Definition: pmmintrin.h:254
#define __DEFAULT_FN_ATTRS
Definition: pmmintrin.h:25
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_hadd_ps(__m128 __a, __m128 __b)
Horizontally adds the adjacent pairs of values contained in two 128-bit vectors of [4 x float].
Definition: pmmintrin.h:87
static __inline__ void __DEFAULT_FN_ATTRS _mm_mwait(unsigned __extensions, unsigned __hints)
Used with the MONITOR instruction to wait while the processor is in the monitor event pending state.
Definition: pmmintrin.h:300
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_addsub_pd(__m128d __a, __m128d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 128-bit vectors of [2 x doub...
Definition: pmmintrin.h:172
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_hsub_pd(__m128d __a, __m128d __b)
Horizontally subtracts the pairs of values contained in two 128-bit vectors of [2 x double].
Definition: pmmintrin.h:218
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehdup_ps(__m128 __a)
Moves and duplicates odd-indexed values from a 128-bit vector of [4 x float] to float values stored i...
Definition: pmmintrin.h:132
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_moveldup_ps(__m128 __a)
Duplicates even-indexed values from a 128-bit vector of [4 x float] to float values stored in a 128-b...
Definition: pmmintrin.h:153
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_addsub_ps(__m128 __a, __m128 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 128-bit vectors of [4 x floa...
Definition: pmmintrin.h:64
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_hsub_ps(__m128 __a, __m128 __b)
Horizontally subtracts the adjacent pairs of values contained in two 128-bit vectors of [4 x float].
Definition: pmmintrin.h:110
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_lddqu_si128(__m128i_u const *__p)
Loads data from an unaligned memory location to elements in a 128-bit vector.
Definition: pmmintrin.h:45
static __inline__ void __DEFAULT_FN_ATTRS _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
Establishes a linear address memory range to be monitored and puts the processor in the monitor event...
Definition: pmmintrin.h:278