clang 20.0.0git
avxvnniint16intrin.h
Go to the documentation of this file.
1/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error \
12 "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13#endif // __IMMINTRIN_H
14
15#ifndef __AVXVNNIINT16INTRIN_H
16#define __AVXVNNIINT16INTRIN_H
17
18/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
19/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
20/// signed 32-bit results. Sum these 2 results with the corresponding
21/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
22///
23/// \headerfile <immintrin.h>
24///
25/// \code
26/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
27/// \endcode
28///
29/// This intrinsic corresponds to the \c VPDPWSUD instruction.
30///
31/// \param __W
32/// A 128-bit vector of [4 x int].
33/// \param __A
34/// A 128-bit vector of [8 x short].
35/// \param __B
36/// A 128-bit vector of [8 x unsigned short].
37/// \returns
38/// A 128-bit vector of [4 x int].
39///
40/// \code{.operation}
41/// FOR j := 0 to 3
42/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
43/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
44/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
45/// ENDFOR
46/// dst[MAX:128] := 0
47/// \endcode
48#define _mm_dpwsud_epi32(__W, __A, __B) \
49 ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \
50 (__v4si)(__B)))
51
52/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
53/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
54/// signed 32-bit results. Sum these 2 results with the corresponding
55/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
56///
57/// \headerfile <immintrin.h>
58///
59/// \code
60/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
61/// \endcode
62///
63/// This intrinsic corresponds to the \c VPDPWSUD instruction.
64///
65/// \param __W
66/// A 256-bit vector of [8 x int].
67/// \param __A
68/// A 256-bit vector of [16 x short].
69/// \param __B
70/// A 256-bit vector of [16 x unsigned short].
71/// \returns
72/// A 256-bit vector of [8 x int].
73///
74/// \code{.operation}
75/// FOR j := 0 to 7
76/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
77/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
78/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
79/// ENDFOR
80/// dst[MAX:256] := 0
81/// \endcode
82#define _mm256_dpwsud_epi32(__W, __A, __B) \
83 ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \
84 (__v8si)(__B)))
85
86/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
87/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
88/// signed 32-bit results. Sum these 2 results with the corresponding
89/// 32-bit integer in \a __W with signed saturation, and store the packed
90/// 32-bit results in \a dst.
91///
92/// \headerfile <immintrin.h>
93///
94/// \code
95/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
96/// \endcode
97///
98/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
99///
100/// \param __W
101/// A 128-bit vector of [4 x int].
102/// \param __A
103/// A 128-bit vector of [8 x short].
104/// \param __B
105/// A 128-bit vector of [8 x unsigned short].
106/// \returns
107/// A 128-bit vector of [4 x int].
108///
109/// \code{.operation}
110/// FOR j := 0 to 3
111/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
112/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
113/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
114/// ENDFOR
115/// dst[MAX:128] := 0
116/// \endcode
117#define _mm_dpwsuds_epi32(__W, __A, __B) \
118 ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \
119 (__v4si)(__B)))
120
121/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
122/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
123/// signed 32-bit results. Sum these 2 results with the corresponding
124/// 32-bit integer in \a __W with signed saturation, and store the packed
125/// 32-bit results in \a dst.
126///
127/// \headerfile <immintrin.h>
128///
129/// \code
130/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
131/// \endcode
132///
133/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
134///
135/// \param __W
136/// A 256-bit vector of [8 x int].
137/// \param __A
138/// A 256-bit vector of [16 x short].
139/// \param __B
140/// A 256-bit vector of [16 x unsigned short].
141/// \returns
142/// A 256-bit vector of [8 x int].
143///
144/// \code{.operation}
145/// FOR j := 0 to 7
146/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
147/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
148/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
149/// ENDFOR
150/// dst[MAX:256] := 0
151/// \endcode
152#define _mm256_dpwsuds_epi32(__W, __A, __B) \
153 ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \
154 (__v8si)(__B)))
155
156/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
157/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
158/// signed 32-bit results. Sum these 2 results with the corresponding
159/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
160///
161/// \headerfile <immintrin.h>
162///
163/// \code
164/// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
165/// \endcode
166///
167/// This intrinsic corresponds to the \c VPDPWUSD instruction.
168///
169/// \param __W
170/// A 128-bit vector of [4 x int].
171/// \param __A
172/// A 128-bit vector of [8 x unsigned short].
173/// \param __B
174/// A 128-bit vector of [8 x short].
175/// \returns
176/// A 128-bit vector of [4 x int].
177///
178/// \code{.operation}
179/// FOR j := 0 to 3
180/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
181/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
182/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
183/// ENDFOR
184/// dst[MAX:128] := 0
185/// \endcode
186#define _mm_dpwusd_epi32(__W, __A, __B) \
187 ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \
188 (__v4si)(__B)))
189
190/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
191/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
192/// signed 32-bit results. Sum these 2 results with the corresponding
193/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
194///
195/// \headerfile <immintrin.h>
196///
197/// \code
198/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
199/// \endcode
200///
201/// This intrinsic corresponds to the \c VPDPWUSD instruction.
202///
203/// \param __W
204/// A 256-bit vector of [8 x int].
205/// \param __A
206/// A 256-bit vector of [16 x unsigned short].
207/// \param __B
208/// A 256-bit vector of [16 x short].
209/// \returns
210/// A 256-bit vector of [8 x int].
211///
212/// \code{.operation}
213/// FOR j := 0 to 7
214/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
215/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
216/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
217/// ENDFOR
218/// dst[MAX:256] := 0
219/// \endcode
220#define _mm256_dpwusd_epi32(__W, __A, __B) \
221 ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \
222 (__v8si)(__B)))
223
224/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
225/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
226/// signed 32-bit results. Sum these 2 results with the corresponding
227/// 32-bit integer in \a __W with signed saturation, and store the packed
228/// 32-bit results in \a dst.
229///
230/// \headerfile <immintrin.h>
231///
232/// \code
233/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
234/// \endcode
235///
236/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
237///
238/// \param __W
239/// A 128-bit vector of [4 x int].
240/// \param __A
241/// A 128-bit vector of [8 x unsigned short].
242/// \param __B
243/// A 128-bit vector of [8 x short].
244/// \returns
245/// A 128-bit vector of [4 x int].
246///
247/// \code{.operation}
248/// FOR j := 0 to 3
249/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
250/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
251/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
252/// ENDFOR
253/// dst[MAX:128] := 0
254/// \endcode
255#define _mm_dpwusds_epi32(__W, __A, __B) \
256 ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \
257 (__v4si)(__B)))
258
259/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
260/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
261/// signed 32-bit results. Sum these 2 results with the corresponding
262/// 32-bit integer in \a __W with signed saturation, and store the packed
263/// 32-bit results in \a dst.
264///
265/// \headerfile <immintrin.h>
266///
267/// \code
268/// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
269/// \endcode
270///
271/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
272///
273/// \param __W
274/// A 256-bit vector of [8 x int].
275/// \param __A
276/// A 256-bit vector of [16 x unsigned short].
277/// \param __B
278/// A 256-bit vector of [16 x short].
279/// \returns
280/// A 256-bit vector of [8 x int].
281///
282/// \code{.operation}
283/// FOR j := 0 to 7
284/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
285/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
286/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
287/// ENDFOR
288/// dst[MAX:256] := 0
289/// \endcode
290#define _mm256_dpwusds_epi32(__W, __A, __B) \
291 ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \
292 (__v8si)(__B)))
293
294/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
295/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
296/// unsigned 32-bit results. Sum these 2 results with the corresponding
297/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
298///
299/// \headerfile <immintrin.h>
300///
301/// \code
302/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
303/// \endcode
304///
305/// This intrinsic corresponds to the \c VPDPWUUD instruction.
306///
307/// \param __W
308/// A 128-bit vector of [4 x unsigned int].
309/// \param __A
310/// A 128-bit vector of [8 x unsigned short].
311/// \param __B
312/// A 128-bit vector of [8 x unsigned short].
313/// \returns
314/// A 128-bit vector of [4 x unsigned int].
315///
316/// \code{.operation}
317/// FOR j := 0 to 3
318/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
319/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
320/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
321/// ENDFOR
322/// dst[MAX:128] := 0
323/// \endcode
324#define _mm_dpwuud_epi32(__W, __A, __B) \
325 ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \
326 (__v4si)(__B)))
327
328/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
329/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
330/// unsigned 32-bit results. Sum these 2 results with the corresponding
331/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
332///
333/// \headerfile <immintrin.h>
334///
335/// \code
336/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
337/// \endcode
338///
339/// This intrinsic corresponds to the \c VPDPWUUD instruction.
340///
341/// \param __W
342/// A 256-bit vector of [8 x unsigned int].
343/// \param __A
344/// A 256-bit vector of [16 x unsigned short].
345/// \param __B
346/// A 256-bit vector of [16 x unsigned short].
347/// \returns
348/// A 256-bit vector of [8 x unsigned int].
349///
350/// \code{.operation}
351/// FOR j := 0 to 7
352/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
353/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
354/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
355/// ENDFOR
356/// dst[MAX:256] := 0
357/// \endcode
358#define _mm256_dpwuud_epi32(__W, __A, __B) \
359 ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \
360 (__v8si)(__B)))
361
362/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
363/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
364/// unsigned 32-bit results. Sum these 2 results with the corresponding
365/// 32-bit integer in \a __W with unsigned saturation, and store the packed
366/// 32-bit results in \a dst.
367///
368/// \headerfile <immintrin.h>
369///
370/// \code
371/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
372/// \endcode
373///
374/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
375///
376/// \param __W
377/// A 128-bit vector of [4 x unsigned int].
378/// \param __A
379/// A 128-bit vector of [8 x unsigned short].
380/// \param __B
381/// A 128-bit vector of [8 x unsigned short].
382/// \returns
383/// A 128-bit vector of [4 x unsigned int].
384///
385/// \code{.operation}
386/// FOR j := 0 to 3
387/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
388/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
389/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
390/// ENDFOR
391/// dst[MAX:128] := 0
392/// \endcode
393#define _mm_dpwuuds_epi32(__W, __A, __B) \
394 ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \
395 (__v4si)(__B)))
396
397/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
398/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
399/// unsigned 32-bit results. Sum these 2 results with the corresponding
400/// 32-bit integer in \a __W with unsigned saturation, and store the packed
401/// 32-bit results in \a dst.
402///
403/// \headerfile <immintrin.h>
404///
405/// \code
406/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
407/// \endcode
408///
409/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
410///
411/// \param __W
412/// A 256-bit vector of [8 x unsigned int].
413/// \param __A
414/// A 256-bit vector of [16 x unsigned short].
415/// \param __B
416/// A 256-bit vector of [16 x unsigned short].
417/// \returns
418/// A 256-bit vector of [8 x unsigned int].
419///
420/// \code{.operation}
421/// FOR j := 0 to 7
422/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
423/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
424/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
425/// ENDFOR
426/// dst[MAX:256] := 0
427/// \endcode
428#define _mm256_dpwuuds_epi32(__W, __A, __B) \
429 ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \
430 (__v8si)(__B)))
431
432#endif // __AVXVNNIINT16INTRIN_H