/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
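
/*
 * SM4E is emitted via .inst so the file assembles even when the
 * toolchain does not know the SM4 mnemonics; the .irp/.set block above
 * maps the vector registers used with it (v0-v3 and v24-v31) to the
 * register numbers needed for the manual encoding.
 */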

/* Register macros */

/* Used for both encryption and decryption */
#define RHASH	v21
#define RRCONST	v22
#define RZERO	v23

/* Helper macros. */

/*
 * input:  m0, m1
 * output: r0:r1 (low 128 bits in r0, high in r1)
 */
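/*
 * Schoolbook carry-less multiply using four PMULLs:
 *
 *   r1:r0 = (m0.d[1] * m1.d[1]) << 128 ^
 *           (m0.d[1] * m1.d[0] ^ m0.d[0] * m1.d[1]) << 64 ^
 *           (m0.d[0] * m1.d[0])
 *
 * The EXTs against RZERO split the middle 128-bit term so that its low
 * half lands in the top of r0 and its high half in the bottom of r1.
 */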
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)	\
	ext	T0.16b, m1.16b, m1.16b, #8;	\
	pmull	r0.1q, m0.1d, m1.1d;		\
	pmull	T1.1q, m0.1d, T0.1d;		\
	pmull2	T0.1q, m0.2d, T0.2d;		\
	pmull2	r1.1q, m0.2d, m1.2d;		\
	eor	T0.16b, T0.16b, T1.16b;		\
	ext	T1.16b, RZERO.16b, T0.16b, #8;	\
	ext	T0.16b, T0.16b, RZERO.16b, #8;	\
	eor	r0.16b, r0.16b, T1.16b;		\
	eor	r1.16b, r1.16b, T0.16b;

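/*
 * Four independent PMUL_128x128 multiplies with their instructions
 * interleaved so that PMULL results are not consumed back to back.
 */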
#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,	\
			r2, r3, m2, m3, T2, T3,	\
			r4, r5, m4, m5, T4, T5,	\
			r6, r7, m6, m7, T6, T7)	\
	ext	T0.16b, m1.16b, m1.16b, #8;	\
	ext	T2.16b, m3.16b, m3.16b, #8;	\
	ext	T4.16b, m5.16b, m5.16b, #8;	\
	ext	T6.16b, m7.16b, m7.16b, #8;	\
	pmull	r0.1q, m0.1d, m1.1d;		\
	pmull	r2.1q, m2.1d, m3.1d;		\
	pmull	r4.1q, m4.1d, m5.1d;		\
	pmull	r6.1q, m6.1d, m7.1d;		\
	pmull	T1.1q, m0.1d, T0.1d;		\
	pmull	T3.1q, m2.1d, T2.1d;		\
	pmull	T5.1q, m4.1d, T4.1d;		\
	pmull	T7.1q, m6.1d, T6.1d;		\
	pmull2	T0.1q, m0.2d, T0.2d;		\
	pmull2	T2.1q, m2.2d, T2.2d;		\
	pmull2	T4.1q, m4.2d, T4.2d;		\
	pmull2	T6.1q, m6.2d, T6.2d;		\
	pmull2	r1.1q, m0.2d, m1.2d;		\
	pmull2	r3.1q, m2.2d, m3.2d;		\
	pmull2	r5.1q, m4.2d, m5.2d;		\
	pmull2	r7.1q, m6.2d, m7.2d;		\
	eor	T0.16b, T0.16b, T1.16b;		\
	eor	T2.16b, T2.16b, T3.16b;		\
	eor	T4.16b, T4.16b, T5.16b;		\
	eor	T6.16b, T6.16b, T7.16b;		\
	ext	T1.16b, RZERO.16b, T0.16b, #8;	\
	ext	T3.16b, RZERO.16b, T2.16b, #8;	\
	ext	T5.16b, RZERO.16b, T4.16b, #8;	\
	ext	T7.16b, RZERO.16b, T6.16b, #8;	\
	ext	T0.16b, T0.16b, RZERO.16b, #8;	\
	ext	T2.16b, T2.16b, RZERO.16b, #8;	\
	ext	T4.16b, T4.16b, RZERO.16b, #8;	\
	ext	T6.16b, T6.16b, RZERO.16b, #8;	\
	eor	r0.16b, r0.16b, T1.16b;		\
	eor	r2.16b, r2.16b, T3.16b;		\
	eor	r4.16b, r4.16b, T5.16b;		\
	eor	r6.16b, r6.16b, T7.16b;		\
	eor	r1.16b, r1.16b, T0.16b;		\
	eor	r3.16b, r3.16b, T2.16b;		\
	eor	r5.16b, r5.16b, T4.16b;		\
	eor	r7.16b, r7.16b, T6.16b;

/*
 * input:  r0:r1 (low 128 bits in r0, high in r1)
 * output: a
 */
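/*
 * Reduce the 256-bit product r0:r1 modulo the GHASH polynomial
 * x^128 + x^7 + x^2 + x + 1.  Since every GHASH operand in this file
 * is bit-reflected with RBIT first, the reduction becomes two folds of
 * the high half by the constant 0x87 = x^7 + x^2 + x + 1
 * (.Lghash_rconst).
 */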
#define REDUCTION(a, r0, r1, rconst, T0, T1)	\
	pmull2	T0.1q, r1.2d, rconst.2d;	\
	ext	T1.16b, T0.16b, RZERO.16b, #8;	\
	ext	T0.16b, RZERO.16b, T0.16b, #8;	\
	eor	r1.16b, r1.16b, T1.16b;		\
	eor	r0.16b, r0.16b, T0.16b;		\
	pmull	T0.1q, r1.1d, rconst.1d;	\
	eor	a.16b, r0.16b, T0.16b;

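/*
 * Run one SM4 encryption (eight SM4E rounds on the round keys in
 * v24-v31) interleaved with one PMUL_128x128, so the SM4 and GHASH
 * work for a block overlap.  The trailing REV64/EXT/REV32 put the
 * state into the final SM4 output byte order.
 */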
#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)	\
	rev32	b0.16b, b0.16b;			\
	ext	T0.16b, m1.16b, m1.16b, #8;	\
	sm4e	b0.4s, v24.4s;			\
	pmull	r0.1q, m0.1d, m1.1d;		\
	sm4e	b0.4s, v25.4s;			\
	pmull	T1.1q, m0.1d, T0.1d;		\
	sm4e	b0.4s, v26.4s;			\
	pmull2	T0.1q, m0.2d, T0.2d;		\
	sm4e	b0.4s, v27.4s;			\
	pmull2	r1.1q, m0.2d, m1.2d;		\
	sm4e	b0.4s, v28.4s;			\
	eor	T0.16b, T0.16b, T1.16b;		\
	sm4e	b0.4s, v29.4s;			\
	ext	T1.16b, RZERO.16b, T0.16b, #8;	\
	sm4e	b0.4s, v30.4s;			\
	ext	T0.16b, T0.16b, RZERO.16b, #8;	\
	sm4e	b0.4s, v31.4s;			\
	eor	r0.16b, r0.16b, T1.16b;		\
	rev64	b0.4s, b0.4s;			\
	eor	r1.16b, r1.16b, T0.16b;		\
	ext	b0.16b, b0.16b, b0.16b, #8;	\
	rev32	b0.16b, b0.16b;

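/*
 * Same idea for three SM4 blocks and three multiplies; the three
 * partial products are also XOR-accumulated into r0:r1 at the end, so
 * a single REDUCTION finishes the GHASH update.
 */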
#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,		\
				    r0, r1, m0, m1, T0, T1,	\
				    r2, r3, m2, m3, T2, T3,	\
				    r4, r5, m4, m5, T4, T5)	\
	rev32	b0.16b, b0.16b;			\
	rev32	b1.16b, b1.16b;			\
	rev32	b2.16b, b2.16b;			\
	ext	T0.16b, m1.16b, m1.16b, #8;	\
	ext	T2.16b, m3.16b, m3.16b, #8;	\
	ext	T4.16b, m5.16b, m5.16b, #8;	\
	sm4e	b0.4s, v24.4s;			\
	sm4e	b1.4s, v24.4s;			\
	sm4e	b2.4s, v24.4s;			\
	pmull	r0.1q, m0.1d, m1.1d;		\
	pmull	r2.1q, m2.1d, m3.1d;		\
	pmull	r4.1q, m4.1d, m5.1d;		\
	sm4e	b0.4s, v25.4s;			\
	sm4e	b1.4s, v25.4s;			\
	sm4e	b2.4s, v25.4s;			\
	pmull	T1.1q, m0.1d, T0.1d;		\
	pmull	T3.1q, m2.1d, T2.1d;		\
	pmull	T5.1q, m4.1d, T4.1d;		\
	sm4e	b0.4s, v26.4s;			\
	sm4e	b1.4s, v26.4s;			\
	sm4e	b2.4s, v26.4s;			\
	pmull2	T0.1q, m0.2d, T0.2d;		\
	pmull2	T2.1q, m2.2d, T2.2d;		\
	pmull2	T4.1q, m4.2d, T4.2d;		\
	sm4e	b0.4s, v27.4s;			\
	sm4e	b1.4s, v27.4s;			\
	sm4e	b2.4s, v27.4s;			\
	pmull2	r1.1q, m0.2d, m1.2d;		\
	pmull2	r3.1q, m2.2d, m3.2d;		\
	pmull2	r5.1q, m4.2d, m5.2d;		\
	sm4e	b0.4s, v28.4s;			\
	sm4e	b1.4s, v28.4s;			\
	sm4e	b2.4s, v28.4s;			\
	eor	T0.16b, T0.16b, T1.16b;		\
	eor	T2.16b, T2.16b, T3.16b;		\
	eor	T4.16b, T4.16b, T5.16b;		\
	sm4e	b0.4s, v29.4s;			\
	sm4e	b1.4s, v29.4s;			\
	sm4e	b2.4s, v29.4s;			\
	ext	T1.16b, RZERO.16b, T0.16b, #8;	\
	ext	T3.16b, RZERO.16b, T2.16b, #8;	\
	ext	T5.16b, RZERO.16b, T4.16b, #8;	\
	sm4e	b0.4s, v30.4s;			\
	sm4e	b1.4s, v30.4s;			\
	sm4e	b2.4s, v30.4s;			\
	ext	T0.16b, T0.16b, RZERO.16b, #8;	\
	ext	T2.16b, T2.16b, RZERO.16b, #8;	\
	ext	T4.16b, T4.16b, RZERO.16b, #8;	\
	sm4e	b0.4s, v31.4s;			\
	sm4e	b1.4s, v31.4s;			\
	sm4e	b2.4s, v31.4s;			\
	eor	r0.16b, r0.16b, T1.16b;		\
	eor	r2.16b, r2.16b, T3.16b;		\
	eor	r4.16b, r4.16b, T5.16b;		\
	rev64	b0.4s, b0.4s;			\
	rev64	b1.4s, b1.4s;			\
	rev64	b2.4s, b2.4s;			\
	eor	r1.16b, r1.16b, T0.16b;		\
	eor	r3.16b, r3.16b, T2.16b;		\
	eor	r5.16b, r5.16b, T4.16b;		\
	ext	b0.16b, b0.16b, b0.16b, #8;	\
	ext	b1.16b, b1.16b, b1.16b, #8;	\
	ext	b2.16b, b2.16b, b2.16b, #8;	\
	eor	r0.16b, r0.16b, r2.16b;		\
	eor	r1.16b, r1.16b, r3.16b;		\
	rev32	b0.16b, b0.16b;			\
	rev32	b1.16b, b1.16b;			\
	rev32	b2.16b, b2.16b;			\
	eor	r0.16b, r0.16b, r4.16b;		\
	eor	r1.16b, r1.16b, r5.16b;

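/*
 * Build the next counter block: x8 holds the high 64 bits of the IV
 * and x9 the low 64 bits as native-endian values, with the 32-bit
 * block counter in the low word of x9.  REV64 converts the block back
 * to big-endian byte order, and x9 is post-incremented for the
 * following block (the increment stays within 32 bits, as GCM
 * requires).  Clobbers w6.
 */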
#define inc32_le128(vctr)		\
	mov	vctr.d[1], x9;		\
	add	w6, w9, #1;		\
	mov	vctr.d[0], x8;		\
	bfi	x9, x6, #0, #32;	\
	rev64	vctr.16b, vctr.16b;

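/*
 * Final GHASH/tag step: fold the lengths block at [x7] (the GCM
 * len(A) || len(C) block) into the GHASH state, encrypt CTR0 (the IV
 * with a 32-bit counter of 1), and XOR the two to form the
 * authentication tag, which is left in RHASH.
 */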
#define GTAG_HASH_LENGTHS(vctr0, vlen)					\
	ld1	{vlen.16b}, [x7];					\
	/* construct CTR0 */						\
	/* the lower 32 bits of the initial IV are always be32(1) */	\
	mov	x6, #0x1;						\
	bfi	x9, x6, #0, #32;					\
	mov	vctr0.d[0], x8;						\
	mov	vctr0.d[1], x9;						\
	rbit	vlen.16b, vlen.16b;					\
	rev64	vctr0.16b, vctr0.16b;					\
	/* authtag = GCTR(CTR0, GHASH) */				\
	eor	RHASH.16b, RHASH.16b, vlen.16b;				\
	SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1,		\
				   RTMP0, RTMP1);			\
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);		\
	rbit	RHASH.16b, RHASH.16b;					\
	eor	RHASH.16b, RHASH.16b, vctr0.16b;


/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
#define RR1	v0
#define RR3	v1
#define RR5	v2
#define RR7	v3

#define RR0	v4
#define RR2	v5
#define RR4	v6
#define RR6	v7

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11
#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define RH1	v16
#define RH2	v17
#define RH3	v18
#define RH4	v19

.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ghash table
	 */
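	/*
	 * Computes H = E(K, 0^128) and stores the bit-reflected powers
	 * H^1..H^4 to the ghash table at x1; the 4-way aggregated GHASH
	 * loops below consume them in that layout.
	 */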
	SM4_PREPARE(x0)

	adr_l	x2, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x2]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

	/* H = E(K, 0^128) */
	rev32	v0.16b, RZERO.16b
	SM4_CRYPT_BLK_BE(v0)

	/* H ^ 1 */
	rbit	RH1.16b, v0.16b

	/* H ^ 2 */
	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 3 */
	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 4 */
	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1	{RH1.16b-RH4.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)

.align 3
SYM_FUNC_START(pmull_ghash_update)
	/* input:
	 *   x0: ghash table
	 *   x1: ghash result
	 *   x2: src
	 *   w3: nblocks
	 */
	ld1	{RH1.16b-RH4.16b}, [x0]

	ld1	{RHASH.16b}, [x1]
	rbit	RHASH.16b, RHASH.16b

	adr_l	x4, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x4]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
	cmp	w3, #4
	blt	.Lghash_loop_1x

	sub	w3, w3, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	rbit	v2.16b, v2.16b
	rbit	v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor	RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor	RR0.16b, RR0.16b, RR2.16b
	eor	RR1.16b, RR1.16b, RR3.16b
	eor	RR0.16b, RR0.16b, RR4.16b
	eor	RR1.16b, RR1.16b, RR5.16b
	eor	RR0.16b, RR0.16b, RR6.16b
	eor	RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz	w3, .Lghash_end
	b	.Lghash_loop_4x

.Lghash_loop_1x:
	sub	w3, w3, #1

	ld1	{v0.16b}, [x2], #16
	rbit	v0.16b, v0.16b
	eor	RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz	w3, .Lghash_loop_1x

.Lghash_end:
	rbit	RHASH.16b, RHASH.16b
	st1	{RHASH.2d}, [x1]

	ret
SYM_FUNC_END(pmull_ghash_update)

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
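	/*
	 * When x7 is non-NULL this is the final call: the lengths block
	 * is hashed and the finished tag is written to [x5].  Otherwise
	 * the running GHASH state goes back to [x5] and the updated
	 * counter to [x3] so a later call can continue.
	 */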
	SM4_PREPARE(x0)

	ldp	x8, x9, [x3]
	rev	x8, x8
	rev	x9, x9

	ld1	{RH1.16b-RH4.16b}, [x6]

	ld1	{RHASH.16b}, [x5]
	rbit	RHASH.16b, RHASH.16b

	adr_l	x6, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x6]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

	cbz	w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
	cmp	w4, #(4 * 16)
	blt	.Lgcm_enc_loop_1x

	sub	w4, w4, #(4 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b, RTMP0.16b
	eor	v1.16b, v1.16b, RTMP1.16b
	eor	v2.16b, v2.16b, RTMP2.16b
	eor	v3.16b, v3.16b, RTMP3.16b
	st1	{v0.16b-v3.16b}, [x1], #64

	/* ghash update */

	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	rbit	v2.16b, v2.16b
	rbit	v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor	RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor	RR0.16b, RR0.16b, RR2.16b
	eor	RR1.16b, RR1.16b, RR3.16b
	eor	RR0.16b, RR0.16b, RR4.16b
	eor	RR1.16b, RR1.16b, RR5.16b
	eor	RR0.16b, RR0.16b, RR6.16b
	eor	RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz	w4, .Lgcm_enc_hash_len
	b	.Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
	cmp	w4, #16
	blt	.Lgcm_enc_tail

	sub	w4, w4, #16

	/* construct CTRs */
	inc32_le128(v0)

	ld1	{RTMP0.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor	v0.16b, v0.16b, RTMP0.16b
	st1	{v0.16b}, [x1], #16

	/* ghash update */
	rbit	v0.16b, v0.16b
	eor	RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz	w4, .Lgcm_enc_hash_len
	b	.Lgcm_enc_loop_1x

.Lgcm_enc_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l	x0, .Lcts_permute_table
	add	x0, x0, #32
	sub	x0, x0, w4, uxtw
	ld1	{v3.16b}, [x0]
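	/*
	 * v3 now holds a TBL mask taken from .Lcts_permute_table at
	 * offset (32 - nbytes): after the byte loop below has rotated
	 * the remaining ciphertext bytes into the top of v0, the TBL
	 * moves them to the start of the block and zeroes the rest,
	 * giving the zero-padded partial block that GHASH expects.
	 */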

.Lgcm_enc_tail_loop:
	/* do encrypt */
	ldrb	w0, [x2], #1		/* get 1 byte from input */
	umov	w6, v0.b[0]		/* get next keystream byte */
	eor	w6, w6, w0		/* w6 = keystream ^ input (ciphertext byte) */
	strb	w6, [x1], #1		/* store output byte */

	/* rotate the consumed keystream byte out of v0 */
	ext	v0.16b, v0.16b, v0.16b, #1
	/* collect the ciphertext bytes in the top of v0, newest in b[15] */
	ins	v0.b[15], w6

	subs	w4, w4, #1
	bne	.Lgcm_enc_tail_loop

	/* move the ciphertext to the bottom of v0 and pad it with zeros */
	tbl	v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit	v0.16b, v0.16b
	eor	RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
	cbz	x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	b	.Lgcm_enc_ret

.Lgcm_enc_end:
	/* store new CTR */
	rev	x8, x8
	rev	x9, x9
	stp	x8, x9, [x3]

	rbit	RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
	/* store new MAC */
	st1	{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)

#undef RR1
#undef RR3
#undef RR5
#undef RR7
#undef RR0
#undef RR2
#undef RR4
#undef RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef RH1
#undef RH2
#undef RH3
#undef RH4


/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define RR1	v6
#define RR3	v7
#define RR5	v8

#define RR0	v9
#define RR2	v10
#define RR4	v11

#define RTMP0	v12
#define RTMP1	v13
#define RTMP2	v14
#define RTMP3	v15
#define RTMP4	v16
#define RTMP5	v17

#define RH1	v18
#define RH2	v19
#define RH3	v20

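/*
 * Decryption works on three blocks per iteration: GHASH runs over the
 * ciphertext, which is available before decryption, so the SM4 rounds
 * and the PMULLs are fully interleaved (SM4_CRYPT_PMUL_128x128_BLK3);
 * v0-v2 build the counter blocks while v3-v5 keep the ciphertext for
 * the final XOR.
 */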
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp	x8, x9, [x3]
	rev	x8, x8
	rev	x9, x9

	ld1	{RH1.16b-RH3.16b}, [x6]

	ld1	{RHASH.16b}, [x5]
	rbit	RHASH.16b, RHASH.16b

	adr_l	x6, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x6]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

	cbz	w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
	cmp	w4, #(3 * 16)
	blt	.Lgcm_dec_loop_1x

	sub	w4, w4, #(3 * 16)

	ld1	{v3.16b-v5.16b}, [x2], #(3 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	rbit	v6.16b, v3.16b
	inc32_le128(v1)			/* +1 */
	rbit	v7.16b, v4.16b
	inc32_le128(v2)			/* +2 */
	rbit	v8.16b, v5.16b

	eor	RHASH.16b, RHASH.16b, v6.16b

	/* decrypt & ghash update */
	SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
				    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
				    RR2, RR3, v7, RH2, RTMP2, RTMP3,
				    RR4, RR5, v8, RH1, RTMP4, RTMP5)

	eor	v0.16b, v0.16b, v3.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	st1	{v0.16b-v2.16b}, [x1], #(3 * 16)

	cbz	w4, .Lgcm_dec_hash_len
	b	.Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
	cmp	w4, #16
	blt	.Lgcm_dec_tail

	sub	w4, w4, #16

	ld1	{v3.16b}, [x2], #16

	/* construct CTRs */
	inc32_le128(v0)
	rbit	v6.16b, v3.16b

	eor	RHASH.16b, RHASH.16b, v6.16b

	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

	eor	v0.16b, v0.16b, v3.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1	{v0.16b}, [x1], #16

	cbz	w4, .Lgcm_dec_hash_len
	b	.Lgcm_dec_loop_1x

.Lgcm_dec_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l	x0, .Lcts_permute_table
	add	x0, x0, #32
	sub	x0, x0, w4, uxtw
	ld1	{v3.16b}, [x0]

.Lgcm_dec_tail_loop:
	/* do decrypt */
	ldrb	w0, [x2], #1		/* get 1 byte from input */
	umov	w6, v0.b[0]		/* get next keystream byte */
	eor	w6, w6, w0		/* w6 = keystream ^ ciphertext (plaintext byte) */
	strb	w6, [x1], #1		/* store output byte */

	/* rotate the consumed keystream byte out of v0 */
	ext	v0.16b, v0.16b, v0.16b, #1
	/* collect the ciphertext (input) bytes in the top of v0 */
	ins	v0.b[15], w0

	subs	w4, w4, #1
	bne	.Lgcm_dec_tail_loop

	/* move the ciphertext to the bottom of v0 and pad it with zeros */
	tbl	v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit	v0.16b, v0.16b
	eor	RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
	cbz	x7, .Lgcm_dec_end

	GTAG_HASH_LENGTHS(v1, v3)

	b	.Lgcm_dec_ret

.Lgcm_dec_end:
	/* store new CTR */
	rev	x8, x8
	rev	x9, x9
	stp	x8, x9, [x3]

	rbit	RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
	/* store new MAC */
	st1	{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)

	.section	".rodata", "a"
	.align 4
.Lcts_permute_table:
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte	0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

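/* GHASH reduction constant: x^7 + x^2 + x + 1 */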
.Lghash_rconst:
	.quad	0x87