clang 20.0.0git
avxvnniintrin.h
Go to the documentation of this file.
/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
24#ifndef __IMMINTRIN_H
25#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
26#endif
27
28#ifndef __AVXVNNIINTRIN_H
29#define __AVXVNNIINTRIN_H
30
/* The intrinsics below, defined in avx512vlvnniintrin.h, can also be used for
 * AVXVNNI. */
/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
40
/* Intrinsics with the _avx_ prefix are provided for compatibility with MSVC. */

/* Default attributes for the functions in this file. Both macros force
 * inlining, suppress debug info for the wrapper, and enable the "avxvnni"
 * target feature on the function itself; they differ only in the minimum
 * vector width requested (256 vs. 128 bits). */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"),        \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"),        \
                 __min_vector_width__(128)))
45
46/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
47/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
48/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
49/// in \a __S, and store the packed 32-bit results in DST.
50///
51/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
52///
53/// \code{.operation}
54/// FOR j := 0 to 7
55/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
56/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
57/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
58/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
59/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
60/// ENDFOR
61/// DST[MAX:256] := 0
62/// \endcode
63static __inline__ __m256i __DEFAULT_FN_ATTRS256
64_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
65{
66 return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
67}
68
69/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
70/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
71/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
72/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
73///
74/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
75///
76/// \code{.operation}
77/// FOR j := 0 to 7
78/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
79/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
80/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
81/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
82/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
83/// ENDFOR
84/// DST[MAX:256] := 0
85/// \endcode
86static __inline__ __m256i __DEFAULT_FN_ATTRS256
87_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
88{
89 return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
90}
91
92/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
93/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
94/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
95/// and store the packed 32-bit results in DST.
96///
97/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
98///
99/// \code{.operation}
100/// FOR j := 0 to 7
101/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
102/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
103/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
104/// ENDFOR
105/// DST[MAX:256] := 0
106/// \endcode
107static __inline__ __m256i __DEFAULT_FN_ATTRS256
108_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
109{
110 return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
111}
112
113/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
114/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
115/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
116/// using signed saturation, and store the packed 32-bit results in DST.
117///
118/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
119///
120/// \code{.operation}
121/// FOR j := 0 to 7
122/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
123/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
124/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
125/// ENDFOR
126/// DST[MAX:256] := 0
127/// \endcode
128static __inline__ __m256i __DEFAULT_FN_ATTRS256
129_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
130{
131 return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
132}
133
134/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
135/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
136/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
137/// in \a __S, and store the packed 32-bit results in DST.
138///
139/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
140///
141/// \code{.operation}
142/// FOR j := 0 to 3
143/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
144/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
145/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
146/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
147/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
148/// ENDFOR
149/// DST[MAX:128] := 0
150/// \endcode
151static __inline__ __m128i __DEFAULT_FN_ATTRS128
152_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
153{
154 return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
155}
156
157/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
158/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
159/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
160/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
161///
162/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
163///
164/// \code{.operation}
165/// FOR j := 0 to 3
166/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
167/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
168/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
169/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
170/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
171/// ENDFOR
172/// DST[MAX:128] := 0
173/// \endcode
174static __inline__ __m128i __DEFAULT_FN_ATTRS128
175_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
176{
177 return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
178}
179
180/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
181/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
182/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
183/// and store the packed 32-bit results in DST.
184///
185/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
186///
187/// \code{.operation}
188/// FOR j := 0 to 3
189/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
190/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
191/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
192/// ENDFOR
193/// DST[MAX:128] := 0
194/// \endcode
195static __inline__ __m128i __DEFAULT_FN_ATTRS128
196_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
197{
198 return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
199}
200
201/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
202/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
203/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
204/// using signed saturation, and store the packed 32-bit results in DST.
205///
206/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
207///
208/// \code{.operation}
209/// FOR j := 0 to 3
210/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
211/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
212/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
213/// ENDFOR
214/// DST[MAX:128] := 0
215/// \endcode
216static __inline__ __m128i __DEFAULT_FN_ATTRS128
217_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
218{
219 return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
220}
221
222#undef __DEFAULT_FN_ATTRS128
223#undef __DEFAULT_FN_ATTRS256
224
225#endif // __AVXVNNIINTRIN_H
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding 16-bit intege...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding signed 8-bit...
Definition: avxvnniintrin.h:87
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding 16-bit intege...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding 16-bit intege...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding signed 8-bit...
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding 16-bit intege...
#define __DEFAULT_FN_ATTRS256
Definition: avxvnniintrin.h:43
#define __DEFAULT_FN_ATTRS128
Definition: avxvnniintrin.h:44
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding signed 8-bit...
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding signed 8-bit...
Definition: avxvnniintrin.h:64