/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"
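
/*
 * SM4_PREPARE() and the SM4_CRYPT_BLK*() helpers used throughout this file
 * come from sm4-ce-asm.h; they load the round keys into v24-v31 and run the
 * SM4E rounds. Callers are expected to invoke these routines with the FPSIMD
 * unit usable (i.e. between kernel_neon_begin() and kernel_neon_end()).
 */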

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
	20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
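
/*
 * The SM4E and SM4EKEY instructions above are emitted by hand via .inst so
 * that the file still assembles with toolchains that lack the SM4 mnemonics.
 * The .Lv<n> symbols defined by the .irp block map the vector register names
 * used in these macros to their instruction encoding numbers.
 */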

/* Register macros */

#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

#define RIV	v20
#define RMAC	v20
#define RMASK	v21
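
/*
 * RIV (CBC/CTS IV) and RMAC (MAC state) deliberately alias v20: no routine
 * in this file needs both at the same time.
 */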


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* input:
	 * x0: 128-bit key
	 * x1: rkey_enc
	 * x2: rkey_dec
	 * x3: fk array
	 * x4: ck array
	 */
	ld1 {v0.16b}, [x0];
	rev32 v0.16b, v0.16b;
	ld1 {v1.16b}, [x3];
	/* load ck */
	ld1 {v24.16b-v27.16b}, [x4], #64;
	ld1 {v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor v0.16b, v0.16b, v1.16b;

	sm4ekey v0.4s, v0.4s, v24.4s;
	sm4ekey v1.4s, v0.4s, v25.4s;
	sm4ekey v2.4s, v1.4s, v26.4s;
	sm4ekey v3.4s, v2.4s, v27.4s;
	sm4ekey v4.4s, v3.4s, v28.4s;
	sm4ekey v5.4s, v4.4s, v29.4s;
	sm4ekey v6.4s, v5.4s, v30.4s;
	sm4ekey v7.4s, v6.4s, v31.4s;

	adr_l x5, .Lbswap128_mask
	ld1 {v24.16b}, [x5]

	st1 {v0.16b-v3.16b}, [x1], #64;
	st1 {v4.16b-v7.16b}, [x1];

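	/*
	 * The decryption round keys are the encryption round keys consumed in
	 * reverse order. Permuting each vector with .Lbswap128_mask reverses
	 * the four 32-bit round keys inside it, and storing v7..v0 reverses
	 * the vector order, yielding rk31..rk0 for rkey_dec.
	 */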
	tbl v16.16b, {v7.16b}, v24.16b
	tbl v17.16b, {v6.16b}, v24.16b
	tbl v18.16b, {v5.16b}, v24.16b
	tbl v19.16b, {v4.16b}, v24.16b
	tbl v20.16b, {v3.16b}, v24.16b
	tbl v21.16b, {v2.16b}, v24.16b
	tbl v22.16b, {v1.16b}, v24.16b
	tbl v23.16b, {v0.16b}, v24.16b

	st1 {v16.16b-v19.16b}, [x2], #64
	st1 {v20.16b-v23.16b}, [x2]

	ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 */
	SM4_PREPARE(x0)

	ld1 {v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1 {v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * w3: nblocks
	 */
	SM4_PREPARE(x0)

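	/*
	 * Bulk path: process 8 blocks per iteration. "sub; tbnz #31" checks
	 * whether the remaining block count went negative; leftovers are then
	 * handled 4 blocks at a time and finally one block at a time.
	 */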
.Lcrypt_loop_blk:
	sub w3, w3, #8;
	tbnz w3, #31, .Lcrypt_tail8;

	ld1 {v0.16b-v3.16b}, [x2], #64;
	ld1 {v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1 {v0.16b-v3.16b}, [x1], #64;
	st1 {v4.16b-v7.16b}, [x1], #64;

	cbz w3, .Lcrypt_end;
	b .Lcrypt_loop_blk;

.Lcrypt_tail8:
	add w3, w3, #8;
	cmp w3, #4;
	blt .Lcrypt_tail4;

	sub w3, w3, #4;

	ld1 {v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1 {v0.16b-v3.16b}, [x1], #64;

	cbz w3, .Lcrypt_end;

.Lcrypt_tail4:
	sub w3, w3, #1;

	ld1 {v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1 {v0.16b}, [x1], #16;

	cbnz w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1 {RIV.16b}, [x3]

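	/*
	 * CBC encryption is inherently serial: each plaintext block is XORed
	 * with the previous ciphertext block before encryption. The 4x loop
	 * therefore still encrypts one block at a time and only batches the
	 * loads and stores.
	 */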
.Lcbc_enc_loop_4x:
	cmp w4, #4
	blt .Lcbc_enc_loop_1x

	sub w4, w4, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	eor v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1 {v0.16b-v3.16b}, [x1], #64
	mov RIV.16b, v3.16b

	cbz w4, .Lcbc_enc_end
	b .Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16

	eor RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1 {RIV.16b}, [x1], #16

	cbnz w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1 {RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1 {RIV.16b}, [x3]

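	/*
	 * CBC decryption parallelises well: 8 ciphertext blocks are decrypted
	 * at once. The original ciphertext is kept in v0-v7 (byte-swapped
	 * copies go through the _BE crypt helpers in v8-v15) so it can be
	 * XORed in afterwards as the chaining value of the following block.
	 */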
.Lcbc_dec_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lcbc_dec_4x

	ld1 {v0.16b-v3.16b}, [x2], #64
	ld1 {v4.16b-v7.16b}, [x2], #64

	rev32 v8.16b, v0.16b
	rev32 v9.16b, v1.16b
	rev32 v10.16b, v2.16b
	rev32 v11.16b, v3.16b
	rev32 v12.16b, v4.16b
	rev32 v13.16b, v5.16b
	rev32 v14.16b, v6.16b
	rev32 v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	eor v8.16b, v8.16b, RIV.16b
	eor v9.16b, v9.16b, v0.16b
	eor v10.16b, v10.16b, v1.16b
	eor v11.16b, v11.16b, v2.16b
	eor v12.16b, v12.16b, v3.16b
	eor v13.16b, v13.16b, v4.16b
	eor v14.16b, v14.16b, v5.16b
	eor v15.16b, v15.16b, v6.16b

	st1 {v8.16b-v11.16b}, [x1], #64
	st1 {v12.16b-v15.16b}, [x1], #64

	mov RIV.16b, v7.16b

	cbz w4, .Lcbc_dec_end
	b .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add w4, w4, #8
	cmp w4, #4
	blt .Lcbc_dec_loop_1x

	sub w4, w4, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	rev32 v8.16b, v0.16b
	rev32 v9.16b, v1.16b
	rev32 v10.16b, v2.16b
	rev32 v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor v8.16b, v8.16b, RIV.16b
	eor v9.16b, v9.16b, v0.16b
	eor v10.16b, v10.16b, v1.16b
	eor v11.16b, v11.16b, v2.16b

	st1 {v8.16b-v11.16b}, [x1], #64

	mov RIV.16b, v3.16b

	cbz w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16

	rev32 v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor v8.16b, v8.16b, RIV.16b
	st1 {v8.16b}, [x1], #16

	mov RIV.16b, v0.16b

	cbnz w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1 {RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub w5, w4, #16
	uxtw x5, w5

	ld1 {RIV.16b}, [x3]

	ld1 {v0.16b}, [x2]
	eor RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

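	/*
	 * Ciphertext stealing for the final 16 + Ln bytes (Ln = x5 = w4 - 16).
	 * The index vector taken from .Lcts_permute_table + x5 shifts the
	 * first Ln bytes of En-1 to the end of a vector, so the overlapping
	 * store below writes them out as the truncated Cn; the vector taken
	 * from .Lcts_permute_table + 32 - x5 moves the Ln-byte partial block
	 * Pn to the start of a vector and zero-pads it.
	 */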
	/* load permute table */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]

	/* overlapping loads */
	add x2, x2, x5
	ld1 {v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl v1.16b, {v1.16b}, v4.16b

	eor v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/* overlapping stores */
	add x5, x1, x5
	st1 {v0.16b}, [x5]
	st1 {v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub w5, w4, #16
	uxtw x5, w5

	ld1 {RIV.16b}, [x3]

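	/*
	 * CTS decryption: decrypt Cn-1 into Xn, XOR its first Ln bytes with Cn
	 * to recover Pn, then splice Cn back over those bytes (tbx) to rebuild
	 * En-1, which decrypts to Pn-1 after XORing with the IV.
	 */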
	/* load permute table */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]

	/* overlapping loads */
	ld1 {v0.16b}, [x2], x5
	ld1 {v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl v2.16b, {v0.16b}, v3.16b
	eor v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, RIV.16b

	/* overlapping stores */
	add x5, x1, x5
	st1 {v2.16b}, [x5]
	st1 {v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: ctr (big endian, 128 bit)
	 * w4: nblocks
	 */
	SM4_PREPARE(x0)

	ldp x7, x8, [x3]
	rev x7, x7
	rev x8, x8

.Lctr_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lctr_4x

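	/*
	 * The 128-bit counter is kept in x7 (high 64 bits) and x8 (low 64
	 * bits) as native integers. inc_le128() materialises the current
	 * value as a big-endian block in the given vector (rev64 restores the
	 * byte order) and then post-increments x8 with carry into x7.
	 */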
#define inc_le128(vctr) \
	mov vctr.d[1], x8; \
	mov vctr.d[0], x7; \
	adds x8, x8, #1; \
	rev64 vctr.16b, vctr.16b; \
	adc x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0) /* +0 */
	inc_le128(v1) /* +1 */
	inc_le128(v2) /* +2 */
	inc_le128(v3) /* +3 */
	inc_le128(v4) /* +4 */
	inc_le128(v5) /* +5 */
	inc_le128(v6) /* +6 */
	inc_le128(v7) /* +7 */

	ld1 {v8.16b-v11.16b}, [x2], #64
	ld1 {v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b

	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	cbz w4, .Lctr_end
	b .Lctr_loop_8x

.Lctr_4x:
	add w4, w4, #8
	cmp w4, #4
	blt .Lctr_loop_1x

	sub w4, w4, #4

	/* construct CTRs */
	inc_le128(v0) /* +0 */
	inc_le128(v1) /* +1 */
	inc_le128(v2) /* +2 */
	inc_le128(v3) /* +3 */

	ld1 {v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b

	st1 {v0.16b-v3.16b}, [x1], #64

	cbz w4, .Lctr_end

.Lctr_loop_1x:
	sub w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1 {v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor v0.16b, v0.16b, v8.16b
	st1 {v0.16b}, [x1], #16

	cbnz w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR */
	rev x7, x7
	rev x8, x8
	stp x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)

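/*
 * tweak_next() doubles an XTS tweak in GF(2^128): each 64-bit lane is
 * shifted left by one (add vt, vin, vin), the bit shifted out of the low
 * half is carried into the high half, and if the top bit of the 128-bit
 * value was set the result is reduced by XORing 0x87 into the low byte.
 * RMASK must hold { 1, 0x87 } in its two 64-bit lanes, which the callers
 * below set up before entering their loops.
 */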
#define tweak_next(vt, vin, RTMP) \
	sshr RTMP.2d, vin.2d, #63; \
	and RTMP.16b, RTMP.16b, RMASK.16b; \
	add vt.2d, vin.2d, vin.2d; \
	ext RTMP.16b, RTMP.16b, RTMP.16b, #8; \
	eor vt.16b, vt.16b, RTMP.16b;

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: tweak (big endian, 128 bit)
	 * w4: nbytes
	 * x5: round key array for IV
	 */
	ld1 {v8.16b}, [x3]

	cbz x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

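	/*
	 * w4 = number of full blocks to process here; w5/x5 = length of the
	 * trailing partial block. If a partial block exists, the last full
	 * block is held back for ciphertext stealing (csel keeps w4 unchanged
	 * only when the "ands" sets Z), and x5 != 0 later selects the CTS
	 * path.
	 */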
	ands w5, w4, #15
	lsr w4, w4, #4
	sub w6, w4, #1
	csel w4, w4, w6, eq
	uxtw x5, w5

	movi RMASK.2s, #0x1
	movi RTMP0.2s, #0x87
	uzp1 RMASK.4s, RMASK.4s, RTMP0.4s

	cbz w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lxts_enc_4x

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1 {v0.16b-v3.16b}, [x2], #64
	ld1 {v4.16b-v7.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b
	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz w4, .Lxts_enc_cts
	b .Lxts_enc_loop_8x

.Lxts_enc_4x:
	add w4, w4, #8
	cmp w4, #4
	blt .Lxts_enc_loop_1x

	sub w4, w4, #4

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1 {v0.16b-v3.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	st1 {v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16
	eor v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor v0.16b, v0.16b, v8.16b
	st1 {v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	cbz x5, .Lxts_enc_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)
	ld1 {v0.16b}, [x2]
	eor v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v8.16b

	/* load permute table */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]

	/* overlapping loads */
	add x2, x2, x5
	ld1 {v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx v0.16b, {v1.16b}, v4.16b

	eor v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v9.16b

	/* overlapping stores */
	add x5, x1, x5
	st1 {v2.16b}, [x5]
	st1 {v0.16b}, [x1]

	b .Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1 {v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: tweak (big endian, 128 bit)
	 * w4: nbytes
	 * x5: round key array for IV
	 */
	ld1 {v8.16b}, [x3]

	cbz x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	ands w5, w4, #15
	lsr w4, w4, #4
	sub w6, w4, #1
	csel w4, w4, w6, eq
	uxtw x5, w5

	movi RMASK.2s, #0x1
	movi RTMP0.2s, #0x87
	uzp1 RMASK.4s, RMASK.4s, RTMP0.4s

	cbz w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lxts_dec_4x

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1 {v0.16b-v3.16b}, [x2], #64
	ld1 {v4.16b-v7.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	eor v4.16b, v4.16b, v12.16b
	eor v5.16b, v5.16b, v13.16b
	eor v6.16b, v6.16b, v14.16b
	eor v7.16b, v7.16b, v15.16b
	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz w4, .Lxts_dec_cts
	b .Lxts_dec_loop_8x

.Lxts_dec_4x:
	add w4, w4, #8
	cmp w4, #4
	blt .Lxts_dec_loop_1x

	sub w4, w4, #4

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1 {v0.16b-v3.16b}, [x2], #64
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	st1 {v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub w4, w4, #1

	ld1 {v0.16b}, [x2], #16
	eor v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor v0.16b, v0.16b, v8.16b
	st1 {v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	cbz x5, .Lxts_dec_end

	/* cipher text stealing */

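	/*
	 * For decryption the last two tweaks are consumed in swapped order:
	 * the final full ciphertext block is processed with the next tweak
	 * (v9) and the stolen-ciphertext block with the current one (v8).
	 */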
	tweak_next(v9, v8, RTMP0)
	ld1 {v0.16b}, [x2]
	eor v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v9.16b

	/* load permute table */
	adr_l x6, .Lcts_permute_table
	add x7, x6, #32
	add x6, x6, x5
	sub x7, x7, x5
	ld1 {v3.16b}, [x6]
	ld1 {v4.16b}, [x7]

	/* overlapping loads */
	add x2, x2, x5
	ld1 {v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx v0.16b, {v1.16b}, v4.16b

	eor v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor v0.16b, v0.16b, v8.16b

	/* overlapping stores */
	add x5, x1, x5
	st1 {v2.16b}, [x5]
	st1 {v0.16b}, [x1]

	b .Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1 {v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/* input:
	 * x0: round key array, CTX
	 * x1: digest
	 * x2: src
	 * w3: nblocks
	 * w4: enc_before
	 * w5: enc_after
	 */
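	/*
	 * CBC-MAC style update: if enc_before is set, the loaded digest is
	 * encrypted once before absorbing new data. If enc_after is clear,
	 * the last block is only XORed into the MAC state and left
	 * unencrypted, so the caller can finish it later (e.g. after mixing
	 * in a final subkey).
	 */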
	SM4_PREPARE(x0)

	ld1 {RMAC.16b}, [x1]

	cbz w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz w3, .Lmac_ret

	sub w6, w3, #1
	cmp w5, wzr
	csel w3, w3, w6, ne

	cbz w3, .Lmac_end

.Lmac_loop_4x:
	cmp w3, #4
	blt .Lmac_loop_1x

	sub w3, w3, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	eor RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz w3, .Lmac_end
	b .Lmac_loop_4x

.Lmac_loop_1x:
	sub w3, w3, #1

	ld1 {v0.16b}, [x2], #16

	eor RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz w3, .Lmac_loop_1x

.Lmac_end:
	cbnz w5, .Lmac_ret

	ld1 {v0.16b}, [x2], #16
	eor RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1 {RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)


	.section ".rodata", "a"
	.align 4
.Lbswap128_mask:
	.byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
	.byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

.Lcts_permute_table:
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
