/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm
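
	/*
	 * Illustration only: PMULL multiplies two 64-bit polynomials over
	 * GF(2), i.e. a multiplication without carries, into a 128-bit
	 * result; __pmull2_p64 does the same on the upper 64-bit lanes of
	 * its operands. A minimal C sketch of the operation (clmul64 is a
	 * hypothetical helper, not part of this file or its ABI):
	 *
	 *	// carryless (polynomial) multiply: 64x64 -> 128 bits
	 *	static void clmul64(u64 a, u64 b, u64 res[2])
	 *	{
	 *		res[0] = res[1] = 0;
	 *		for (int i = 0; i < 64; i++)
	 *			if ((b >> i) & 1) {
	 *				res[0] ^= a << i;
	 *				if (i)
	 *					res[1] ^= a >> (64 - i);
	 *			}
	 *	}
	 */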

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm
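
	/*
	 * Note on the p8 fallback above: lacking a 64x64 PMULL, the product
	 * is assembled from 8x8 polynomial multiplies using the well-known
	 * vmull.p8 decomposition: A is split into byte-rotated copies A1..A3
	 * and B into B1..B4, and the partial sums L, M, N, K are masked to
	 * discard the bytes that wrapped around before being shifted into
	 * place (the ext #15/#14/#13/#12 above implement the << 8/16/24/32
	 * from the comments) and folded into D = A*B.
	 */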

	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm
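
	/*
	 * x3 points at the hash key: H at [x3], with what are effectively
	 * H^2, H^3 and H^4 following it, so that four blocks can be
	 * aggregated per loop iteration. SHASH2/HH34 hold the xor of the
	 * two 64-bit halves of each power, i.e. the (b1 + b0) operand of
	 * the Karatsuba multiplications below. Each 64-bit lane of MASK
	 * ends up as 0xc200000000000000, a constant derived from the GHASH
	 * reduction polynomial g(x) = x^128 + x^7 + x^2 + x + 1 in the
	 * bit-reflected representation GHASH uses.
	 */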

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm
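
	/*
	 * perm1..perm3 (and T1) built above are tbl index vectors that
	 * rotate each 64-bit half of a register by 1..4 bytes; sh1..sh4
	 * and ss1..ss4 cache the rotated copies of SHASH and SHASH2 so
	 * the B1..B4 operands need not be recomputed on every call to
	 * __pmull_p8/__pmull2_p8.
	 */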

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm
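
	/*
	 * In other words: the 256-bit product accumulated in XH:XL (with
	 * the middle Karatsuba term folded in via XM/T1) is reduced to 128
	 * bits by two 64-bit folds, each a single PMULL by MASK; the caller
	 * completes the reduction with two further eors of XH into XL.
	 */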

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm
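
	/*
	 * Here the multiplications by the reduction polynomial are
	 * open-coded as shift/eor sequences: the shl by 57/62/63 and the
	 * ushr by 1 and 6 correspond to the x^7, x^2 and x terms of g(x),
	 * mirroring what the p64 path obtains from a single PMULL by MASK
	 * per fold.
	 */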

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)
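
/*
 * Illustration only: a sketch of how C glue code might invoke these
 * routines (names loosely modelled on the kernel's ghash-ce-glue.c; the
 * authoritative types and call sites live there, not here):
 *
 *	kernel_neon_begin();
 *	pmull_ghash_update_p64(blocks, dg, src, key, head);
 *	kernel_neon_end();
 *
 * which matches the register usage above: x0 = #blocks, x1 = dg,
 * x2 = src, x3 = key, x4 = optional head block.
 */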

	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm
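
	/*
	 * The macro above loads K0..K3 from the start of the key schedule,
	 * K4/K5 from offset 64, and the final three round keys KK/KL/KM
	 * from (\rounds << 4) - 32, i.e. relative to the end of the
	 * schedule, so the same code serves 10, 12 and 14 rounds.
	 */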

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm
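
	/*
	 * enc_block keeps the AES-128 case as straight-line code: the
	 * extra rounds for 192/256-bit keys live in .subsection 1, out of
	 * line, and are only reached when bit 2 of \rounds is set (12 or
	 * 14 rounds).
	 */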

	.align		6
	.macro		pmull_gcm_do_crypt, enc
	frame_push	1

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f				// tag only?

	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8	)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldr		x10, [sp, #.Lframe_local_offset]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	frame_pop
	ret

5:
CPU_LE(	rev		w8, w8	)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

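/*
 * The GHASH core below folds four ciphertext blocks per call using the
 * precomputed powers of H:
 *
 *	X' = ((((X + C1)*H + C2)*H + C3)*H + C4)*H
 *	   = (X + C1)*H^4 + C2*H^3 + C3*H^2 + C4*H
 *
 * where + is addition in GF(2^128), i.e. eor. w9 carries the block count
 * (1-4) so the same routine also handles the tail of a message.
 */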
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

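/*
 * pmull_gcm_enc_4x generates four blocks of AES-CTR keystream and eors
 * them into INP0-INP3. On entry, w8 holds the (CPU-endian) lower counter
 * word just past the blocks of this round, so the per-block counters are
 * obtained by subtracting 4..1, byte-reversing, and inserting the result
 * into word 3 of each counter block (GCM keeps its 32-bit block counter
 * big endian).
 */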
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

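/*
 * .Lpermute_table below interleaves 16-byte runs of 0xff with the identity
 * permutation 0x0..0xf. Loading a tbl/tbx index vector from an offset into
 * this table shifts data into place while mapping the out-of-range lanes
 * to 0xff, which tbl turns into zero bytes. The same table thus serves the
 * overlapping partial-block loads/stores above and the authsize-based tag
 * masking in the decrypt path.
 */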
	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous