/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0   v8
#define RTMP1   v9
#define RTMP2   v10
#define RTMP3   v11

#define RTMP4   v12
#define RTMP5   v13
#define RTMP6   v14
#define RTMP7   v15

#define RX0     v12
#define RX1     v13
#define RKEY    v14
#define RIV     v15

/* Helper macros. */

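/*
 * SM4_PREPARE() loads the 256-byte SM4 S-box (crypto_sm4_sbox, the table
 * exported by the generic SM4 library code) into v16-v31, 16 bytes per
 * register, so the round macros below can evaluate the S-box with tbl/tbx
 * lookups instead of scalar table accesses.  It clobbers x5 and reserves
 * v16-v31 for the table.
 */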
#define SM4_PREPARE() \
        adr_l   x5, crypto_sm4_sbox; \
        ld1     {v16.16b-v19.16b}, [x5], #64; \
        ld1     {v20.16b-v23.16b}, [x5], #64; \
        ld1     {v24.16b-v27.16b}, [x5], #64; \
        ld1     {v28.16b-v31.16b}, [x5];

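/*
 * Transpose a 4x4 matrix of 32-bit words held in s0-s3 (s0-s7 in the 2x
 * variant): afterwards vector n holds word n of each of the four blocks,
 * i.e. the column-major layout the parallel round macros operate on.  Used
 * on paths that load whole blocks with ld1 or that build counter blocks in
 * registers; the ld4 loads further below produce this layout directly.
 */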
#define transpose_4x4(s0, s1, s2, s3) \
        zip1    RTMP0.4s, s0.4s, s1.4s; \
        zip1    RTMP1.4s, s2.4s, s3.4s; \
        zip2    RTMP2.4s, s0.4s, s1.4s; \
        zip2    RTMP3.4s, s2.4s, s3.4s; \
        zip1    s0.2d, RTMP0.2d, RTMP1.2d; \
        zip2    s1.2d, RTMP0.2d, RTMP1.2d; \
        zip1    s2.2d, RTMP2.2d, RTMP3.2d; \
        zip2    s3.2d, RTMP2.2d, RTMP3.2d;

#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
        zip1    RTMP0.4s, s0.4s, s1.4s; \
        zip1    RTMP1.4s, s2.4s, s3.4s; \
        zip2    RTMP2.4s, s0.4s, s1.4s; \
        zip2    RTMP3.4s, s2.4s, s3.4s; \
        zip1    RTMP4.4s, s4.4s, s5.4s; \
        zip1    RTMP5.4s, s6.4s, s7.4s; \
        zip2    RTMP6.4s, s4.4s, s5.4s; \
        zip2    RTMP7.4s, s6.4s, s7.4s; \
        zip1    s0.2d, RTMP0.2d, RTMP1.2d; \
        zip2    s1.2d, RTMP0.2d, RTMP1.2d; \
        zip1    s2.2d, RTMP2.2d, RTMP3.2d; \
        zip2    s3.2d, RTMP2.2d, RTMP3.2d; \
        zip1    s4.2d, RTMP4.2d, RTMP5.2d; \
        zip2    s5.2d, RTMP4.2d, RTMP5.2d; \
        zip1    s6.2d, RTMP6.2d, RTMP7.2d; \
        zip2    s7.2d, RTMP6.2d, RTMP7.2d;

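/*
 * Undo the column-major layout after the 32 rounds: transpose back to one
 * block per vector while also reversing the word order within each block,
 * which implements the final reverse transformation R of SM4 (the output
 * block is (X35, X34, X33, X32)).
 */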
#define rotate_clockwise_4x4(s0, s1, s2, s3) \
        zip1    RTMP0.4s, s1.4s, s0.4s; \
        zip2    RTMP1.4s, s1.4s, s0.4s; \
        zip1    RTMP2.4s, s3.4s, s2.4s; \
        zip2    RTMP3.4s, s3.4s, s2.4s; \
        zip1    s0.2d, RTMP2.2d, RTMP0.2d; \
        zip2    s1.2d, RTMP2.2d, RTMP0.2d; \
        zip1    s2.2d, RTMP3.2d, RTMP1.2d; \
        zip2    s3.2d, RTMP3.2d, RTMP1.2d;

#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
        zip1    RTMP0.4s, s1.4s, s0.4s; \
        zip1    RTMP2.4s, s3.4s, s2.4s; \
        zip2    RTMP1.4s, s1.4s, s0.4s; \
        zip2    RTMP3.4s, s3.4s, s2.4s; \
        zip1    RTMP4.4s, s5.4s, s4.4s; \
        zip1    RTMP6.4s, s7.4s, s6.4s; \
        zip2    RTMP5.4s, s5.4s, s4.4s; \
        zip2    RTMP7.4s, s7.4s, s6.4s; \
        zip1    s0.2d, RTMP2.2d, RTMP0.2d; \
        zip2    s1.2d, RTMP2.2d, RTMP0.2d; \
        zip1    s2.2d, RTMP3.2d, RTMP1.2d; \
        zip2    s3.2d, RTMP3.2d, RTMP1.2d; \
        zip1    s4.2d, RTMP6.2d, RTMP4.2d; \
        zip2    s5.2d, RTMP6.2d, RTMP4.2d; \
        zip1    s6.2d, RTMP7.2d, RTMP5.2d; \
        zip2    s7.2d, RTMP7.2d, RTMP5.2d;

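/*
 * One SM4 round applied to four blocks in parallel:
 *
 *     s0 ^= T(rk ^ s1 ^ s2 ^ s3)
 *
 * where T(.) = L(tau(.)), tau applies the S-box to each byte, and
 *
 *     L(B) = B ^ rol32(B, 2) ^ rol32(B, 10) ^ rol32(B, 18) ^ rol32(B, 24)
 *          = B ^ rol32(B, 24) ^ rol32(B ^ rol32(B, 8) ^ rol32(B, 16), 2)
 *
 * which is the decomposition computed below.  The S-box lookup walks the
 * four 64-byte quarters of the table in v16-v31: tbl returns 0 for
 * out-of-range indices and tbx leaves the destination byte untouched, so
 * subtracting 64 between steps selects the quarter each index falls into.
 */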
#define ROUND4(round, s0, s1, s2, s3) \
        dup     RX0.4s, RKEY.s[round]; \
        /* rk ^ s1 ^ s2 ^ s3 */ \
        eor     RTMP1.16b, s2.16b, s3.16b; \
        eor     RX0.16b, RX0.16b, s1.16b; \
        eor     RX0.16b, RX0.16b, RTMP1.16b; \
        \
        /* sbox, non-linear part */ \
        movi    RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
        tbl     RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
        sub     RX0.16b, RX0.16b, RTMP3.16b; \
        tbx     RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
        sub     RX0.16b, RX0.16b, RTMP3.16b; \
        tbx     RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
        sub     RX0.16b, RX0.16b, RTMP3.16b; \
        tbx     RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
        \
        /* linear part */ \
        shl     RTMP1.4s, RTMP0.4s, #8; \
        shl     RTMP2.4s, RTMP0.4s, #16; \
        shl     RTMP3.4s, RTMP0.4s, #24; \
        sri     RTMP1.4s, RTMP0.4s, #(32-8); \
        sri     RTMP2.4s, RTMP0.4s, #(32-16); \
        sri     RTMP3.4s, RTMP0.4s, #(32-24); \
        /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
        eor     RTMP1.16b, RTMP1.16b, RTMP0.16b; \
        eor     RTMP1.16b, RTMP1.16b, RTMP2.16b; \
        /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
        eor     RTMP3.16b, RTMP3.16b, RTMP0.16b; \
        shl     RTMP2.4s, RTMP1.4s, #2; \
        sri     RTMP2.4s, RTMP1.4s, #(32-2); \
        eor     RTMP3.16b, RTMP3.16b, RTMP2.16b; \
        /* s0 ^= RTMP3 */ \
        eor     s0.16b, s0.16b, RTMP3.16b;

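/*
 * Run all 32 rounds (8 loop iterations of 4 rounds, loading 4 round keys
 * per iteration from [x0]) on four blocks in column-major layout, then
 * convert back to one output block per vector.  The _BE variant expects the
 * caller to have already rev32-swapped the big-endian input words;
 * SM4_CRYPT_BLK4 performs that swap itself.  x0 is rewound to the start of
 * the 128-byte round key array on exit; x6 and the RTMP0-RTMP3/RX0/RKEY
 * registers are clobbered.
 */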
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) \
        mov     x6, #8; \
4: \
        ld1     {RKEY.4s}, [x0], #16; \
        subs    x6, x6, #1; \
        \
        ROUND4(0, b0, b1, b2, b3); \
        ROUND4(1, b1, b2, b3, b0); \
        ROUND4(2, b2, b3, b0, b1); \
        ROUND4(3, b3, b0, b1, b2); \
        \
        bne     4b; \
        \
        rev32   b0.16b, b0.16b; \
        rev32   b1.16b, b1.16b; \
        rev32   b2.16b, b2.16b; \
        rev32   b3.16b, b3.16b; \
        \
        rotate_clockwise_4x4(b0, b1, b2, b3); \
        \
        /* repoint to rkey */ \
        sub     x0, x0, #128;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
        rev32   b0.16b, b0.16b; \
        rev32   b1.16b, b1.16b; \
        rev32   b2.16b, b2.16b; \
        rev32   b3.16b, b3.16b; \
        SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);

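/*
 * Same round function as ROUND4, applied to two independent groups of four
 * blocks (s0-s3 and t0-t3) with the instruction streams interleaved so the
 * tbl/tbx and shift latencies of one group are hidden behind the other.
 */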
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3) \
        /* rk ^ s1 ^ s2 ^ s3 */ \
        dup     RX0.4s, RKEY.s[round]; \
        eor     RTMP0.16b, s2.16b, s3.16b; \
        mov     RX1.16b, RX0.16b; \
        eor     RTMP1.16b, t2.16b, t3.16b; \
        eor     RX0.16b, RX0.16b, s1.16b; \
        eor     RX1.16b, RX1.16b, t1.16b; \
        eor     RX0.16b, RX0.16b, RTMP0.16b; \
        eor     RX1.16b, RX1.16b, RTMP1.16b; \
        \
        /* sbox, non-linear part */ \
        movi    RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
        tbl     RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
        tbl     RTMP1.16b, {v16.16b-v19.16b}, RX1.16b; \
        sub     RX0.16b, RX0.16b, RTMP3.16b; \
        sub     RX1.16b, RX1.16b, RTMP3.16b; \
        tbx     RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
        tbx     RTMP1.16b, {v20.16b-v23.16b}, RX1.16b; \
        sub     RX0.16b, RX0.16b, RTMP3.16b; \
        sub     RX1.16b, RX1.16b, RTMP3.16b; \
        tbx     RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
        tbx     RTMP1.16b, {v24.16b-v27.16b}, RX1.16b; \
        sub     RX0.16b, RX0.16b, RTMP3.16b; \
        sub     RX1.16b, RX1.16b, RTMP3.16b; \
        tbx     RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
        tbx     RTMP1.16b, {v28.16b-v31.16b}, RX1.16b; \
        \
        /* linear part */ \
        shl     RX0.4s, RTMP0.4s, #8; \
        shl     RX1.4s, RTMP1.4s, #8; \
        shl     RTMP2.4s, RTMP0.4s, #16; \
        shl     RTMP3.4s, RTMP1.4s, #16; \
        sri     RX0.4s, RTMP0.4s, #(32 - 8); \
        sri     RX1.4s, RTMP1.4s, #(32 - 8); \
        sri     RTMP2.4s, RTMP0.4s, #(32 - 16); \
        sri     RTMP3.4s, RTMP1.4s, #(32 - 16); \
        /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
        eor     RX0.16b, RX0.16b, RTMP0.16b; \
        eor     RX1.16b, RX1.16b, RTMP1.16b; \
        eor     RX0.16b, RX0.16b, RTMP2.16b; \
        eor     RX1.16b, RX1.16b, RTMP3.16b; \
        /* RTMP0/1 = x ^ rol32(x, 24) ^ rol32(RX, 2) */ \
        shl     RTMP2.4s, RTMP0.4s, #24; \
        shl     RTMP3.4s, RTMP1.4s, #24; \
        sri     RTMP2.4s, RTMP0.4s, #(32 - 24); \
        sri     RTMP3.4s, RTMP1.4s, #(32 - 24); \
        eor     RTMP0.16b, RTMP0.16b, RTMP2.16b; \
        eor     RTMP1.16b, RTMP1.16b, RTMP3.16b; \
        shl     RTMP2.4s, RX0.4s, #2; \
        shl     RTMP3.4s, RX1.4s, #2; \
        sri     RTMP2.4s, RX0.4s, #(32 - 2); \
        sri     RTMP3.4s, RX1.4s, #(32 - 2); \
        eor     RTMP0.16b, RTMP0.16b, RTMP2.16b; \
        eor     RTMP1.16b, RTMP1.16b, RTMP3.16b; \
        /* s0/t0 ^= RTMP0/1 */ \
        eor     s0.16b, s0.16b, RTMP0.16b; \
        eor     t0.16b, t0.16b, RTMP1.16b;

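/*
 * Encrypt/decrypt eight blocks as two interleaved groups of four.  The
 * _norotate variant leaves the state in column-major order so the caller
 * can apply rotate_clockwise_4x4 itself; the CBC decryption path does that
 * in two 4x4 steps to avoid the RTMP4-RTMP7 temporaries, which alias RIV.
 */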
#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
        rev32   b0.16b, b0.16b; \
        rev32   b1.16b, b1.16b; \
        rev32   b2.16b, b2.16b; \
        rev32   b3.16b, b3.16b; \
        rev32   b4.16b, b4.16b; \
        rev32   b5.16b, b5.16b; \
        rev32   b6.16b, b6.16b; \
        rev32   b7.16b, b7.16b; \
        \
        mov     x6, #8; \
8: \
        ld1     {RKEY.4s}, [x0], #16; \
        subs    x6, x6, #1; \
        \
        ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7); \
        ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4); \
        ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5); \
        ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6); \
        \
        bne     8b; \
        \
        rev32   b0.16b, b0.16b; \
        rev32   b1.16b, b1.16b; \
        rev32   b2.16b, b2.16b; \
        rev32   b3.16b, b3.16b; \
        rev32   b4.16b, b4.16b; \
        rev32   b5.16b, b5.16b; \
        rev32   b6.16b, b6.16b; \
        rev32   b7.16b, b7.16b; \
        \
        /* repoint to rkey */ \
        sub     x0, x0, #128;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
        SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7); \
        rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);

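/*
 * void sm4_neon_crypt(const u32 *rkey, u8 *dst, const u8 *src,
 *                     unsigned int nblocks);
 *
 * Expected C prototype on the glue side (sm4-neon-glue.c); the parameter
 * names are illustrative.  Since the routine uses NEON registers, kernel
 * callers must bracket it with kernel_neon_begin()/kernel_neon_end().
 * Encryption or decryption is selected purely by which round key array
 * (encryption or decryption keys) is passed in x0.
 */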
.align 3
SYM_FUNC_START(sm4_neon_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: nblocks
         */
        SM4_PREPARE()

.Lcrypt_loop_8x:
        sub     w3, w3, #8
        tbnz    w3, #31, .Lcrypt_4x

        ld4     {v0.4s-v3.4s}, [x2], #64
        ld4     {v4.4s-v7.4s}, [x2], #64

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        st1     {v0.16b-v3.16b}, [x1], #64
        st1     {v4.16b-v7.16b}, [x1], #64

        cbz     w3, .Lcrypt_end
        b       .Lcrypt_loop_8x

.Lcrypt_4x:
        add     w3, w3, #8
        cmp     w3, #4
        blt     .Lcrypt_tail

        sub     w3, w3, #4

        ld4     {v0.4s-v3.4s}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        st1     {v0.16b-v3.16b}, [x1], #64

        cbz     w3, .Lcrypt_end

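        /*
         * Tail: 1-3 blocks remain.  Load them into v0-v2 (v3 is a
         * don't-care), transpose into column-major form, run a full
         * 4-block encryption and store only the results that correspond
         * to real input blocks.
         */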
.Lcrypt_tail:
        cmp     w3, #2
        ld1     {v0.16b}, [x2], #16
        blt     .Lcrypt_tail_load_done
        ld1     {v1.16b}, [x2], #16
        beq     .Lcrypt_tail_load_done
        ld1     {v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        cmp     w3, #2
        st1     {v0.16b}, [x1], #16
        blt     .Lcrypt_end
        st1     {v1.16b}, [x1], #16
        beq     .Lcrypt_end
        st1     {v2.16b}, [x1], #16

.Lcrypt_end:
        ret
SYM_FUNC_END(sm4_neon_crypt)

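/*
 * void sm4_neon_cbc_dec(const u32 *rkey_dec, u8 *dst, const u8 *src,
 *                       u8 *iv, unsigned int nblocks);
 *
 * Expected C prototype on the glue side (parameter names illustrative).
 * CBC decryption only: each plaintext block is the decrypted ciphertext
 * block XORed with the previous ciphertext block (or the IV for the first
 * block), and the last ciphertext block processed is written back to *iv
 * as the next IV.
 */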
.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE()

        ld1     {RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
        sub     w4, w4, #8
        tbnz    w4, #31, .Lcbc_dec_4x

        ld4     {v0.4s-v3.4s}, [x2], #64
        ld4     {v4.4s-v7.4s}, [x2]

        SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

        /* Avoid overwriting the RIV register */
        rotate_clockwise_4x4(v0, v1, v2, v3)
        rotate_clockwise_4x4(v4, v5, v6, v7)

        sub     x2, x2, #64

        eor     v0.16b, v0.16b, RIV.16b

        ld1     {RTMP0.16b-RTMP3.16b}, [x2], #64
        ld1     {RTMP4.16b-RTMP7.16b}, [x2], #64

        eor     v1.16b, v1.16b, RTMP0.16b
        eor     v2.16b, v2.16b, RTMP1.16b
        eor     v3.16b, v3.16b, RTMP2.16b
        eor     v4.16b, v4.16b, RTMP3.16b
        eor     v5.16b, v5.16b, RTMP4.16b
        eor     v6.16b, v6.16b, RTMP5.16b
        eor     v7.16b, v7.16b, RTMP6.16b

        mov     RIV.16b, RTMP7.16b

        st1     {v0.16b-v3.16b}, [x1], #64
        st1     {v4.16b-v7.16b}, [x1], #64

        cbz     w4, .Lcbc_dec_end
        b       .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
        add     w4, w4, #8
        cmp     w4, #4
        blt     .Lcbc_dec_tail

        sub     w4, w4, #4

        ld1     {v0.16b-v3.16b}, [x2], #64

        rev32   v4.16b, v0.16b
        rev32   v5.16b, v1.16b
        rev32   v6.16b, v2.16b
        rev32   v7.16b, v3.16b

        transpose_4x4(v4, v5, v6, v7)

        SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

        eor     v4.16b, v4.16b, RIV.16b
        eor     v5.16b, v5.16b, v0.16b
        eor     v6.16b, v6.16b, v1.16b
        eor     v7.16b, v7.16b, v2.16b

        mov     RIV.16b, v3.16b

        st1     {v4.16b-v7.16b}, [x1], #64

        cbz     w4, .Lcbc_dec_end

.Lcbc_dec_tail:
        cmp     w4, #2
        ld1     {v0.16b}, [x2], #16
        blt     .Lcbc_dec_tail_load_done
        ld1     {v1.16b}, [x2], #16
        beq     .Lcbc_dec_tail_load_done
        ld1     {v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
        rev32   v4.16b, v0.16b
        rev32   v5.16b, v1.16b
        rev32   v6.16b, v2.16b

        transpose_4x4(v4, v5, v6, v7)

        SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

        cmp     w4, #2
        eor     v4.16b, v4.16b, RIV.16b
        mov     RIV.16b, v0.16b
        st1     {v4.16b}, [x1], #16
        blt     .Lcbc_dec_end

        eor     v5.16b, v5.16b, v0.16b
        mov     RIV.16b, v1.16b
        st1     {v5.16b}, [x1], #16
        beq     .Lcbc_dec_end

        eor     v6.16b, v6.16b, v1.16b
        mov     RIV.16b, v2.16b
        st1     {v6.16b}, [x1], #16

.Lcbc_dec_end:
        /* store new IV */
        st1     {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_neon_cbc_dec)

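/*
 * void sm4_neon_ctr_crypt(const u32 *rkey_enc, u8 *dst, const u8 *src,
 *                         u8 *ctr, unsigned int nblocks);
 *
 * Expected C prototype on the glue side (parameter names illustrative).
 * CTR mode uses the encryption round keys for both directions; the 128-bit
 * big-endian counter at *ctr is consumed and the incremented value is
 * written back on return.
 */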
.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE()

        ldp     x7, x8, [x3]
        rev     x7, x7
        rev     x8, x8

.Lctr_crypt_loop_8x:
        sub     w4, w4, #8
        tbnz    w4, #31, .Lctr_crypt_4x

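/*
 * Emit the current counter as a 128-bit big-endian block in vctr, then
 * increment it.  x7:x8 hold the counter as a native-endian high:low pair
 * (the rev instructions above and at the end convert to and from the
 * big-endian in-memory form); adds/adc propagate the carry from x8 into x7.
 */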
#define inc_le128(vctr) \
        mov     vctr.d[1], x8; \
        mov     vctr.d[0], x7; \
        adds    x8, x8, #1; \
        rev64   vctr.16b, vctr.16b; \
        adc     x7, x7, xzr;

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */
        inc_le128(v4)                   /* +4 */
        inc_le128(v5)                   /* +5 */
        inc_le128(v6)                   /* +6 */
        inc_le128(v7)                   /* +7 */

        transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        ld1     {RTMP0.16b-RTMP3.16b}, [x2], #64
        ld1     {RTMP4.16b-RTMP7.16b}, [x2], #64

        eor     v0.16b, v0.16b, RTMP0.16b
        eor     v1.16b, v1.16b, RTMP1.16b
        eor     v2.16b, v2.16b, RTMP2.16b
        eor     v3.16b, v3.16b, RTMP3.16b
        eor     v4.16b, v4.16b, RTMP4.16b
        eor     v5.16b, v5.16b, RTMP5.16b
        eor     v6.16b, v6.16b, RTMP6.16b
        eor     v7.16b, v7.16b, RTMP7.16b

        st1     {v0.16b-v3.16b}, [x1], #64
        st1     {v4.16b-v7.16b}, [x1], #64

        cbz     w4, .Lctr_crypt_end
        b       .Lctr_crypt_loop_8x

.Lctr_crypt_4x:
        add     w4, w4, #8
        cmp     w4, #4
        blt     .Lctr_crypt_tail

        sub     w4, w4, #4

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */

        ld1     {v4.16b-v7.16b}, [x2], #64

        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor     v0.16b, v0.16b, v4.16b
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v7.16b

        st1     {v0.16b-v3.16b}, [x1], #64

        cbz     w4, .Lctr_crypt_end

.Lctr_crypt_tail:
        /* inc_le128 will change the sign bit */
        ld1     {v4.16b}, [x2], #16
        inc_le128(v0)
        cmp     w4, #2
        blt     .Lctr_crypt_tail_load_done

        ld1     {v5.16b}, [x2], #16
        inc_le128(v1)
        cmp     w4, #2
        beq     .Lctr_crypt_tail_load_done

        ld1     {v6.16b}, [x2], #16
        inc_le128(v2)

.Lctr_crypt_tail_load_done:
        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        cmp     w4, #2

        eor     v0.16b, v0.16b, v4.16b
        st1     {v0.16b}, [x1], #16
        blt     .Lctr_crypt_end

        eor     v1.16b, v1.16b, v5.16b
        st1     {v1.16b}, [x1], #16
        beq     .Lctr_crypt_end

        eor     v2.16b, v2.16b, v6.16b
        st1     {v2.16b}, [x1], #16

.Lctr_crypt_end:
        /* store new CTR */
        rev     x7, x7
        rev     x8, x8
        stp     x7, x8, [x3]

        ret
SYM_FUNC_END(sm4_neon_ctr_crypt)

/* source: linux/arch/arm64/crypto/sm4-neon-core.S */