/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19
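
	/*
	 * Register usage, as far as it can be derived from this file and the
	 * C glue code that calls it: SHASH holds the GHASH key H, and HH,
	 * HH3 and HH4 are expected to hold the precomputed powers H^2, H^3
	 * and H^4 used for 4-way aggregation. SHASH2 and HH34 hold the XOR
	 * of the high and low halves of those keys, which feeds the middle
	 * Karatsuba product. XL, XM and XH accumulate the low, middle and
	 * high partial products of each 128-bit multiplication.
	 */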

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm
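
	/*
	 * The __pmull_p64/__pmull2_p64 macros above are thin wrappers around
	 * the 64x64->128 bit PMULL instruction. The __pmull_p8 variants
	 * below emulate the same operation using only the 8-bit polynomial
	 * multiply, by operating on rotated copies of the operands and
	 * recombining the masked and shifted partial products (the A1..K
	 * comments follow that decomposition).
	 */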

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
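	// The 256-bit product is reduced modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1. GHASH operates on bit-reflected
	// values, so the low polynomial terms appear here as the constant
	// MASK = 0xe1 << 57 (set up by __pmull_pre_p64) rather than as
	// their plain encoding 0x87.
	//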
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
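	// This version performs the same reduction without PMULL: in the
	// reflected domain, multiplying by the low polynomial terms
	// x^7 + x^2 + x + 1 becomes a combination of the 64-bit left
	// shifts by 57, 62 and 63 (64 - 7, 64 - 2 and 64 - 1) and the
	// matching right shifts used below.
	//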
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
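	/*
	 * Per the AAPCS64 calling convention, the arguments arrive as
	 * x0 = blocks, x1 = dg, x2 = src, x3 = k and x4 = head, which is
	 * how the __pmull_ghash macro above consumes them.
	 */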
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

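	/*
	 * Load the AES round keys of the key schedule at \rk: K0-K5 get the
	 * first six round keys, while KK/KL/KM are loaded relative to
	 * \rounds so that they always hold the final three round keys
	 * regardless of key size. K6-K9 are (re)loaded on demand by the
	 * users of this macro.
	 */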
	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

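	/*
	 * Encrypt a single block in \state. The round count in \rounds
	 * selects the key length: bit 2 is set for 12 or 14 rounds
	 * (AES-192/AES-256) but clear for 10 rounds (AES-128), and bit 1
	 * then distinguishes 14 rounds from 12.
	 */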
	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

	.align		6
	.macro		pmull_gcm_do_crypt, enc
	frame_push	1

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f				// tag only?

	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8		)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte        |        |        |        |x       |
	 * 16 bytes       |        |        |        |xxxxxxxx|
	 * 17 bytes       |        |        |xxxxxxxx|x       |
	 * 47 bytes       |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldr		x10, [sp, #.Lframe_local_offset]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	frame_pop
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
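	/*
	 * Note that pmull_gcm_do_crypt treats x0 as a byte count rather
	 * than a block count: it rounds it up to a block count itself and
	 * handles any partial tail block via the permute table.
	 */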
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

	.section	".rodata", "a"
	.align		6
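	/*
	 * Permute vectors for handling partial blocks and truncated tags:
	 * loading 16 bytes from a varying offset into this table yields a
	 * tbl/tbx index vector that keeps only the wanted bytes, since the
	 * out-of-range 0xff entries return zero with tbl and leave the
	 * destination unchanged with tbx.
	 */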
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous
