clang 20.0.0git
sm4intrin.h
Go to the documentation of this file.
1/*===--------------- sm4intrin.h - SM4 intrinsics -----------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <sm4intrin.h> directly; include <immintrin.h> instead."
12#endif // __IMMINTRIN_H
13
14#ifndef __SM4INTRIN_H
15#define __SM4INTRIN_H
16
/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
///
/// \param __A
///    A 128-bit vector of [4 x int].
/// \param __B
///    A 128-bit vector of [4 x int].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
///   count := n % 32
///   dest := (dword << count) | (dword >> (32-count))
///   RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
///   RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
///   tmp.byte[0] := SBOX_BYTE(dword, 0)
///   tmp.byte[1] := SBOX_BYTE(dword, 1)
///   tmp.byte[2] := SBOX_BYTE(dword, 2)
///   tmp.byte[3] := SBOX_BYTE(dword, 3)
///   RETURN tmp
/// }
/// DEFINE L_KEY(dword) {
///   RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
/// }
/// DEFINE T_KEY(dword) {
///   RETURN L_KEY(lower_t(dword))
/// }
/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
///   RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i := 0 to 0
///   P[0] := __B.xmm[i].dword[0]
///   P[1] := __B.xmm[i].dword[1]
///   P[2] := __B.xmm[i].dword[2]
///   P[3] := __B.xmm[i].dword[3]
///   C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
///   C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
///   C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
///   C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
///   DEST.xmm[i].dword[0] := C[0]
///   DEST.xmm[i].dword[1] := C[1]
///   DEST.xmm[i].dword[2] := C[2]
///   DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:128] := 0
/// \endcode
// A and B are parenthesized so that an expression argument (e.g. a
// conditional or comma expression) binds to the (__v4su) cast as a whole.
#define _mm_sm4key4_epi32(A, B)                                                \
  ((__m128i)__builtin_ia32_vsm4key4128((__v4su)(A), (__v4su)(B)))
78
/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
///
/// \param __A
///    A 256-bit vector of [8 x int].
/// \param __B
///    A 256-bit vector of [8 x int].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
///   count := n % 32
///   dest := (dword << count) | (dword >> (32-count))
///   RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
///   RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
///   tmp.byte[0] := SBOX_BYTE(dword, 0)
///   tmp.byte[1] := SBOX_BYTE(dword, 1)
///   tmp.byte[2] := SBOX_BYTE(dword, 2)
///   tmp.byte[3] := SBOX_BYTE(dword, 3)
///   RETURN tmp
/// }
/// DEFINE L_KEY(dword) {
///   RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
/// }
/// DEFINE T_KEY(dword) {
///   RETURN L_KEY(lower_t(dword))
/// }
/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
///   RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i := 0 to 1
///   P[0] := __B.xmm[i].dword[0]
///   P[1] := __B.xmm[i].dword[1]
///   P[2] := __B.xmm[i].dword[2]
///   P[3] := __B.xmm[i].dword[3]
///   C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
///   C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
///   C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
///   C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
///   DEST.xmm[i].dword[0] := C[0]
///   DEST.xmm[i].dword[1] := C[1]
///   DEST.xmm[i].dword[2] := C[2]
///   DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:256] := 0
/// \endcode
// A and B are parenthesized so that an expression argument (e.g. a
// conditional or comma expression) binds to the (__v8su) cast as a whole.
#define _mm256_sm4key4_epi32(A, B)                                             \
  ((__m256i)__builtin_ia32_vsm4key4256((__v8su)(A), (__v8su)(B)))
140
/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
///
/// \param __A
///    A 128-bit vector of [4 x int].
/// \param __B
///    A 128-bit vector of [4 x int].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
///   count := n % 32
///   dest := (dword << count) | (dword >> (32-count))
///   RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
///   RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
///   tmp.byte[0] := SBOX_BYTE(dword, 0)
///   tmp.byte[1] := SBOX_BYTE(dword, 1)
///   tmp.byte[2] := SBOX_BYTE(dword, 2)
///   tmp.byte[3] := SBOX_BYTE(dword, 3)
///   RETURN tmp
/// }
/// DEFINE L_RND(dword) {
///   tmp := dword
///   tmp := tmp ^ ROL32(dword, 2)
///   tmp := tmp ^ ROL32(dword, 10)
///   tmp := tmp ^ ROL32(dword, 18)
///   tmp := tmp ^ ROL32(dword, 24)
///   RETURN tmp
/// }
/// DEFINE T_RND(dword) {
///   RETURN L_RND(lower_t(dword))
/// }
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
///   RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i := 0 to 0
///   P[0] := __B.xmm[i].dword[0]
///   P[1] := __B.xmm[i].dword[1]
///   P[2] := __B.xmm[i].dword[2]
///   P[3] := __B.xmm[i].dword[3]
///   C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
///   C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
///   C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
///   C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
///   DEST.xmm[i].dword[0] := C[0]
///   DEST.xmm[i].dword[1] := C[1]
///   DEST.xmm[i].dword[2] := C[2]
///   DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:128] := 0
/// \endcode
// A and B are parenthesized so that an expression argument (e.g. a
// conditional or comma expression) binds to the (__v4su) cast as a whole.
#define _mm_sm4rnds4_epi32(A, B)                                               \
  ((__m128i)__builtin_ia32_vsm4rnds4128((__v4su)(A), (__v4su)(B)))
204
/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
///
/// \param __A
///    A 256-bit vector of [8 x int].
/// \param __B
///    A 256-bit vector of [8 x int].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
///   count := n % 32
///   dest := (dword << count) | (dword >> (32-count))
///   RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
///   RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
///   tmp.byte[0] := SBOX_BYTE(dword, 0)
///   tmp.byte[1] := SBOX_BYTE(dword, 1)
///   tmp.byte[2] := SBOX_BYTE(dword, 2)
///   tmp.byte[3] := SBOX_BYTE(dword, 3)
///   RETURN tmp
/// }
/// DEFINE L_RND(dword) {
///   tmp := dword
///   tmp := tmp ^ ROL32(dword, 2)
///   tmp := tmp ^ ROL32(dword, 10)
///   tmp := tmp ^ ROL32(dword, 18)
///   tmp := tmp ^ ROL32(dword, 24)
///   RETURN tmp
/// }
/// DEFINE T_RND(dword) {
///   RETURN L_RND(lower_t(dword))
/// }
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
///   RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i := 0 to 1
///   P[0] := __B.xmm[i].dword[0]
///   P[1] := __B.xmm[i].dword[1]
///   P[2] := __B.xmm[i].dword[2]
///   P[3] := __B.xmm[i].dword[3]
///   C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
///   C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
///   C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
///   C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
///   DEST.xmm[i].dword[0] := C[0]
///   DEST.xmm[i].dword[1] := C[1]
///   DEST.xmm[i].dword[2] := C[2]
///   DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:256] := 0
/// \endcode
// A and B are parenthesized so that an expression argument (e.g. a
// conditional or comma expression) binds to the (__v8su) cast as a whole.
#define _mm256_sm4rnds4_epi32(A, B)                                            \
  ((__m256i)__builtin_ia32_vsm4rnds4256((__v8su)(A), (__v8su)(B)))
268
269#endif // __SM4INTRIN_H