/* Source listing: avxvnniint8intrin.h (clang 20.0.0git). */
/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9#ifndef __IMMINTRIN_H
10#error \
11 "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXVNNIINT8INTRIN_H
15#define __AVXVNNIINT8INTRIN_H
16
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbssd_epi32(__W, __A, __B)                                        \
  ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A),           \
                                       (__v4si)(__B)))
50
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbssd_epi32(__W, __A, __B)                                     \
  ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A),           \
                                       (__v8si)(__B)))
84
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbssds_epi32(__W, __A, __B)                                       \
  ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A),          \
                                        (__v4si)(__B)))
119
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbssds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))
154
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbsud_epi32(__W, __A, __B)                                        \
  ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A),           \
                                       (__v4si)(__B)))
188
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbsud_epi32(__W, __A, __B)                                     \
  ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A),           \
                                       (__v8si)(__B)))
222
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbsuds_epi32(__W, __A, __B)                                       \
  ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A),          \
                                        (__v4si)(__B)))
257
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbsuds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))
292
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A
///    with corresponding unsigned 8-bit integers in \a __B, producing 4
///    intermediate unsigned 16-bit results. Sum these 4 results with the
///    corresponding 32-bit integer in \a __W, and store the packed 32-bit
///    results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbuud_epi32(__W, __A, __B)                                        \
  ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A),           \
                                       (__v4si)(__B)))
326
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A
///    with corresponding unsigned 8-bit integers in \a __B, producing 4
///    intermediate unsigned 16-bit results. Sum these 4 results with the
///    corresponding 32-bit integer in \a __W, and store the packed 32-bit
///    results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbuud_epi32(__W, __A, __B)                                     \
  ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A),           \
                                       (__v8si)(__B)))
360
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A
///    with corresponding unsigned 8-bit integers in \a __B, producing 4
///    intermediate unsigned 16-bit results. Sum these 4 results with the
///    corresponding 32-bit integer in \a __W with unsigned saturation, and
///    store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbuuds_epi32(__W, __A, __B)                                       \
  ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A),          \
                                        (__v4si)(__B)))
395
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A
///    with corresponding unsigned 8-bit integers in \a __B, producing 4
///    intermediate unsigned 16-bit results. Sum these 4 results with the
///    corresponding 32-bit integer in \a __W with unsigned saturation, and
///    store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbuuds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))
429
430#endif // __AVXVNNIINT8INTRIN_H