1 | /* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */ |
2 | /* |
3 | * AES CTR mode by8 optimization with AVX instructions. (x86_64) |
4 | * |
5 | * Copyright(c) 2014 Intel Corporation. |
6 | * |
7 | * Contact Information: |
8 | * James Guilford <james.guilford@intel.com> |
9 | * Sean Gulley <sean.m.gulley@intel.com> |
10 | * Chandramouli Narayanan <mouli@linux.intel.com> |
11 | */ |
12 | /* |
13 | * This is AES128/192/256 CTR mode optimization implementation. It requires |
14 | * the support of Intel(R) AESNI and AVX instructions. |
15 | * |
16 | * This work was inspired by the AES CTR mode optimization published |
17 | * in Intel Optimized IPSEC Cryptographic library. |
18 | * Additional information on it can be found at: |
19 | * https://github.com/intel/intel-ipsec-mb |
20 | */ |
21 | |
22 | #include <linux/linkage.h> |
23 | |
24 | #define VMOVDQ vmovdqu |
25 | |
26 | /* |
27 | * Note: the "x" prefix in these aliases means "this is an xmm register". The |
28 | * alias prefixes have no relation to XCTR where the "X" prefix means "XOR |
29 | * counter". |
30 | */ |
31 | #define xdata0 %xmm0 |
32 | #define xdata1 %xmm1 |
33 | #define xdata2 %xmm2 |
34 | #define xdata3 %xmm3 |
35 | #define xdata4 %xmm4 |
36 | #define xdata5 %xmm5 |
37 | #define xdata6 %xmm6 |
38 | #define xdata7 %xmm7 |
39 | #define xcounter %xmm8 // CTR mode only |
40 | #define xiv %xmm8 // XCTR mode only |
41 | #define xbyteswap %xmm9 // CTR mode only |
42 | #define xtmp %xmm9 // XCTR mode only |
43 | #define xkey0 %xmm10 |
44 | #define xkey4 %xmm11 |
45 | #define xkey8 %xmm12 |
46 | #define xkey12 %xmm13 |
47 | #define xkeyA %xmm14 |
48 | #define xkeyB %xmm15 |
49 | |
50 | #define p_in %rdi |
51 | #define p_iv %rsi |
52 | #define p_keys %rdx |
53 | #define p_out %rcx |
54 | #define num_bytes %r8 |
55 | #define counter %r9 // XCTR mode only |
56 | #define tmp %r10 |
57 | #define DDQ_DATA 0 |
58 | #define XDATA 1 |
59 | #define KEY_128 1 |
60 | #define KEY_192 2 |
61 | #define KEY_256 3 |
62 | |
63 | .section .rodata |
64 | .align 16 |
65 | |
66 | byteswap_const: |
67 | .octa 0x000102030405060708090A0B0C0D0E0F |
68 | ddq_low_msk: |
69 | .octa 0x0000000000000000FFFFFFFFFFFFFFFF |
70 | ddq_high_add_1: |
71 | .octa 0x00000000000000010000000000000000 |
72 | ddq_add_1: |
73 | .octa 0x00000000000000000000000000000001 |
74 | ddq_add_2: |
75 | .octa 0x00000000000000000000000000000002 |
76 | ddq_add_3: |
77 | .octa 0x00000000000000000000000000000003 |
78 | ddq_add_4: |
79 | .octa 0x00000000000000000000000000000004 |
80 | ddq_add_5: |
81 | .octa 0x00000000000000000000000000000005 |
82 | ddq_add_6: |
83 | .octa 0x00000000000000000000000000000006 |
84 | ddq_add_7: |
85 | .octa 0x00000000000000000000000000000007 |
86 | ddq_add_8: |
87 | .octa 0x00000000000000000000000000000008 |
88 | |
89 | .text |
90 | |
/*
 * Note: there is no generator macro for the ddq_add_x constants; they are
 * referenced directly, RIP-relative, as (ddq_add_1 + 16 * i)(%rip).
 */

/*
 * setxdata n - bind the assembler symbol var_xdata to register %xmm<n>.
 * Invoked through 'club' in .altmacro mode so that \n is the already
 * evaluated numeric loop counter, letting .rept loops walk xmm0..xmm7.
 */
.macro setxdata n
	var_xdata = %xmm\n
.endm
97 | |
/*
 * club name, id - club the numeric 'id' to the symbol 'name'.
 * For name == XDATA this sets var_xdata = %xmm<id>.  .altmacro is needed
 * so the '%\id' argument is passed to setxdata as its evaluated numeric
 * value rather than as the literal text of the expression.
 */
.macro club name, id
.altmacro
.if \name == XDATA
	setxdata %\id
.endif
.noaltmacro
.endm
107 | |
108 | /* |
109 | * do_aes num_in_par load_keys key_len |
110 | * This increments p_in, but not p_out |
111 | */ |
112 | .macro do_aes b, k, key_len, xctr |
113 | .set by, \b |
114 | .set load_keys, \k |
115 | .set klen, \key_len |
116 | |
117 | .if (load_keys) |
118 | vmovdqa 0*16(p_keys), xkey0 |
119 | .endif |
120 | |
121 | .if \xctr |
122 | movq counter, xtmp |
123 | .set i, 0 |
124 | .rept (by) |
125 | club XDATA, i |
126 | vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata |
127 | .set i, (i +1) |
128 | .endr |
129 | .set i, 0 |
130 | .rept (by) |
131 | club XDATA, i |
132 | vpxor xiv, var_xdata, var_xdata |
133 | .set i, (i +1) |
134 | .endr |
135 | .else |
136 | vpshufb xbyteswap, xcounter, xdata0 |
137 | .set i, 1 |
138 | .rept (by - 1) |
139 | club XDATA, i |
140 | vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata |
141 | vptest ddq_low_msk(%rip), var_xdata |
142 | jnz 1f |
143 | vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata |
144 | vpaddq ddq_high_add_1(%rip), xcounter, xcounter |
145 | 1: |
146 | vpshufb xbyteswap, var_xdata, var_xdata |
147 | .set i, (i +1) |
148 | .endr |
149 | .endif |
150 | |
151 | vmovdqa 1*16(p_keys), xkeyA |
152 | |
153 | vpxor xkey0, xdata0, xdata0 |
154 | .if \xctr |
155 | add $by, counter |
156 | .else |
157 | vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter |
158 | vptest ddq_low_msk(%rip), xcounter |
159 | jnz 1f |
160 | vpaddq ddq_high_add_1(%rip), xcounter, xcounter |
161 | 1: |
162 | .endif |
163 | |
164 | .set i, 1 |
165 | .rept (by - 1) |
166 | club XDATA, i |
167 | vpxor xkey0, var_xdata, var_xdata |
168 | .set i, (i +1) |
169 | .endr |
170 | |
171 | vmovdqa 2*16(p_keys), xkeyB |
172 | |
173 | .set i, 0 |
174 | .rept by |
175 | club XDATA, i |
176 | vaesenc xkeyA, var_xdata, var_xdata /* key 1 */ |
177 | .set i, (i +1) |
178 | .endr |
179 | |
180 | .if (klen == KEY_128) |
181 | .if (load_keys) |
182 | vmovdqa 3*16(p_keys), xkey4 |
183 | .endif |
184 | .else |
185 | vmovdqa 3*16(p_keys), xkeyA |
186 | .endif |
187 | |
188 | .set i, 0 |
189 | .rept by |
190 | club XDATA, i |
191 | vaesenc xkeyB, var_xdata, var_xdata /* key 2 */ |
192 | .set i, (i +1) |
193 | .endr |
194 | |
195 | add $(16*by), p_in |
196 | |
197 | .if (klen == KEY_128) |
198 | vmovdqa 4*16(p_keys), xkeyB |
199 | .else |
200 | .if (load_keys) |
201 | vmovdqa 4*16(p_keys), xkey4 |
202 | .endif |
203 | .endif |
204 | |
205 | .set i, 0 |
206 | .rept by |
207 | club XDATA, i |
208 | /* key 3 */ |
209 | .if (klen == KEY_128) |
210 | vaesenc xkey4, var_xdata, var_xdata |
211 | .else |
212 | vaesenc xkeyA, var_xdata, var_xdata |
213 | .endif |
214 | .set i, (i +1) |
215 | .endr |
216 | |
217 | vmovdqa 5*16(p_keys), xkeyA |
218 | |
219 | .set i, 0 |
220 | .rept by |
221 | club XDATA, i |
222 | /* key 4 */ |
223 | .if (klen == KEY_128) |
224 | vaesenc xkeyB, var_xdata, var_xdata |
225 | .else |
226 | vaesenc xkey4, var_xdata, var_xdata |
227 | .endif |
228 | .set i, (i +1) |
229 | .endr |
230 | |
231 | .if (klen == KEY_128) |
232 | .if (load_keys) |
233 | vmovdqa 6*16(p_keys), xkey8 |
234 | .endif |
235 | .else |
236 | vmovdqa 6*16(p_keys), xkeyB |
237 | .endif |
238 | |
239 | .set i, 0 |
240 | .rept by |
241 | club XDATA, i |
242 | vaesenc xkeyA, var_xdata, var_xdata /* key 5 */ |
243 | .set i, (i +1) |
244 | .endr |
245 | |
246 | vmovdqa 7*16(p_keys), xkeyA |
247 | |
248 | .set i, 0 |
249 | .rept by |
250 | club XDATA, i |
251 | /* key 6 */ |
252 | .if (klen == KEY_128) |
253 | vaesenc xkey8, var_xdata, var_xdata |
254 | .else |
255 | vaesenc xkeyB, var_xdata, var_xdata |
256 | .endif |
257 | .set i, (i +1) |
258 | .endr |
259 | |
260 | .if (klen == KEY_128) |
261 | vmovdqa 8*16(p_keys), xkeyB |
262 | .else |
263 | .if (load_keys) |
264 | vmovdqa 8*16(p_keys), xkey8 |
265 | .endif |
266 | .endif |
267 | |
268 | .set i, 0 |
269 | .rept by |
270 | club XDATA, i |
271 | vaesenc xkeyA, var_xdata, var_xdata /* key 7 */ |
272 | .set i, (i +1) |
273 | .endr |
274 | |
275 | .if (klen == KEY_128) |
276 | .if (load_keys) |
277 | vmovdqa 9*16(p_keys), xkey12 |
278 | .endif |
279 | .else |
280 | vmovdqa 9*16(p_keys), xkeyA |
281 | .endif |
282 | |
283 | .set i, 0 |
284 | .rept by |
285 | club XDATA, i |
286 | /* key 8 */ |
287 | .if (klen == KEY_128) |
288 | vaesenc xkeyB, var_xdata, var_xdata |
289 | .else |
290 | vaesenc xkey8, var_xdata, var_xdata |
291 | .endif |
292 | .set i, (i +1) |
293 | .endr |
294 | |
295 | vmovdqa 10*16(p_keys), xkeyB |
296 | |
297 | .set i, 0 |
298 | .rept by |
299 | club XDATA, i |
300 | /* key 9 */ |
301 | .if (klen == KEY_128) |
302 | vaesenc xkey12, var_xdata, var_xdata |
303 | .else |
304 | vaesenc xkeyA, var_xdata, var_xdata |
305 | .endif |
306 | .set i, (i +1) |
307 | .endr |
308 | |
309 | .if (klen != KEY_128) |
310 | vmovdqa 11*16(p_keys), xkeyA |
311 | .endif |
312 | |
313 | .set i, 0 |
314 | .rept by |
315 | club XDATA, i |
316 | /* key 10 */ |
317 | .if (klen == KEY_128) |
318 | vaesenclast xkeyB, var_xdata, var_xdata |
319 | .else |
320 | vaesenc xkeyB, var_xdata, var_xdata |
321 | .endif |
322 | .set i, (i +1) |
323 | .endr |
324 | |
325 | .if (klen != KEY_128) |
326 | .if (load_keys) |
327 | vmovdqa 12*16(p_keys), xkey12 |
328 | .endif |
329 | |
330 | .set i, 0 |
331 | .rept by |
332 | club XDATA, i |
333 | vaesenc xkeyA, var_xdata, var_xdata /* key 11 */ |
334 | .set i, (i +1) |
335 | .endr |
336 | |
337 | .if (klen == KEY_256) |
338 | vmovdqa 13*16(p_keys), xkeyA |
339 | .endif |
340 | |
341 | .set i, 0 |
342 | .rept by |
343 | club XDATA, i |
344 | .if (klen == KEY_256) |
345 | /* key 12 */ |
346 | vaesenc xkey12, var_xdata, var_xdata |
347 | .else |
348 | vaesenclast xkey12, var_xdata, var_xdata |
349 | .endif |
350 | .set i, (i +1) |
351 | .endr |
352 | |
353 | .if (klen == KEY_256) |
354 | vmovdqa 14*16(p_keys), xkeyB |
355 | |
356 | .set i, 0 |
357 | .rept by |
358 | club XDATA, i |
359 | /* key 13 */ |
360 | vaesenc xkeyA, var_xdata, var_xdata |
361 | .set i, (i +1) |
362 | .endr |
363 | |
364 | .set i, 0 |
365 | .rept by |
366 | club XDATA, i |
367 | /* key 14 */ |
368 | vaesenclast xkeyB, var_xdata, var_xdata |
369 | .set i, (i +1) |
370 | .endr |
371 | .endif |
372 | .endif |
373 | |
374 | .set i, 0 |
375 | .rept (by / 2) |
376 | .set j, (i+1) |
377 | VMOVDQ (i*16 - 16*by)(p_in), xkeyA |
378 | VMOVDQ (j*16 - 16*by)(p_in), xkeyB |
379 | club XDATA, i |
380 | vpxor xkeyA, var_xdata, var_xdata |
381 | club XDATA, j |
382 | vpxor xkeyB, var_xdata, var_xdata |
383 | .set i, (i+2) |
384 | .endr |
385 | |
386 | .if (i < by) |
387 | VMOVDQ (i*16 - 16*by)(p_in), xkeyA |
388 | club XDATA, i |
389 | vpxor xkeyA, var_xdata, var_xdata |
390 | .endif |
391 | |
392 | .set i, 0 |
393 | .rept by |
394 | club XDATA, i |
395 | VMOVDQ var_xdata, i*16(p_out) |
396 | .set i, (i+1) |
397 | .endr |
398 | .endm |
399 | |
/* do_aes with the cached round keys (re)loaded from p_keys */
.macro do_aes_load val, key_len, xctr
	do_aes \val, 1, \key_len, \xctr
.endm
403 | |
/* do_aes assuming xkey0/xkey4/xkey8/xkey12 are already loaded */
.macro do_aes_noload val, key_len, xctr
	do_aes \val, 0, \key_len, \xctr
.endm
407 | |
/*
 * do_aes_ctrmain key_len, xctr - main body of aes ctr load.
 *
 * Shared driver for all six entry points.  Processes the remainder
 * (num_bytes mod 8 blocks, i.e. 1..7 blocks) first via a branch ladder,
 * then loops over full groups of 8 blocks with the round keys cached in
 * xkey0/xkey4/xkey8/xkey12.  In CTR mode the updated IV is written back
 * to (p_iv) before returning.  Label names are made unique per expansion
 * by appending \xctr and \key_len.
 *
 * NOTE(review): num_bytes is assumed to be a multiple of 16 here (partial
 * final blocks appear to be handled by the caller) — confirm against the
 * glue code.
 */
.macro do_aes_ctrmain key_len, xctr
	cmp	$16, num_bytes
	jb	.Ldo_return2\xctr\key_len

	.if \xctr
		/* XCTR: convert byte offset to a block counter; load raw IV */
		shr	$4, counter
		vmovdqu	(p_iv), xiv
	.else
		/* CTR: load IV and byte-swap it to little-endian for arithmetic */
		vmovdqa	byteswap_const(%rip), xbyteswap
		vmovdqu	(p_iv), xcounter
		vpshufb	xbyteswap, xcounter, xcounter
	.endif

	/* tmp = remainder of num_bytes beyond a multiple of 8 blocks */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\xctr\key_len

	/* 1 <= tmp <= 7 (in blocks): binary dispatch on the remainder */
	cmp	$(4*16), tmp
	jg	.Lgt4\xctr\key_len
	je	.Leq4\xctr\key_len

.Llt4\xctr\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\xctr\key_len
	je	.Leq2\xctr\key_len

.Leq1\xctr\key_len:
	do_aes_load	1, \key_len, \xctr
	add	$(1*16), p_out
	and	$(~7*16), num_bytes		/* round down to 8-block multiple */
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq2\xctr\key_len:
	do_aes_load	2, \key_len, \xctr
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len


.Leq3\xctr\key_len:
	do_aes_load	3, \key_len, \xctr
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq4\xctr\key_len:
	do_aes_load	4, \key_len, \xctr
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lgt4\xctr\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\xctr\key_len
	je	.Leq6\xctr\key_len

.Leq5\xctr\key_len:
	do_aes_load	5, \key_len, \xctr
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq6\xctr\key_len:
	do_aes_load	6, \key_len, \xctr
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq7\xctr\key_len:
	do_aes_load	7, \key_len, \xctr
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lmult_of_8_blks\xctr\key_len:
	/*
	 * Cache round keys 0/4/8/12 (0/3/6/9 for AES-128) so the 8-block
	 * main loop can use do_aes_noload.
	 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
	.align 16
.Lmain_loop2\xctr\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 */
	do_aes_noload	8, \key_len, \xctr
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\xctr\key_len

.Ldo_return2\xctr\key_len:
	.if !\xctr
	/* return updated IV, byte-swapped back to big-endian */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	.endif
	RET
.endm
520 | |
521 | /* |
522 | * routine to do AES128 CTR enc/decrypt "by8" |
523 | * XMM registers are clobbered. |
524 | * Saving/restoring must be done at a higher level |
525 | * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, |
526 | * unsigned int num_bytes) |
527 | */ |
528 | SYM_FUNC_START(aes_ctr_enc_128_avx_by8) |
529 | /* call the aes main loop */ |
530 | do_aes_ctrmain KEY_128 0 |
531 | |
532 | SYM_FUNC_END(aes_ctr_enc_128_avx_by8) |
533 | |
534 | /* |
535 | * routine to do AES192 CTR enc/decrypt "by8" |
536 | * XMM registers are clobbered. |
537 | * Saving/restoring must be done at a higher level |
538 | * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, |
539 | * unsigned int num_bytes) |
540 | */ |
541 | SYM_FUNC_START(aes_ctr_enc_192_avx_by8) |
542 | /* call the aes main loop */ |
543 | do_aes_ctrmain KEY_192 0 |
544 | |
545 | SYM_FUNC_END(aes_ctr_enc_192_avx_by8) |
546 | |
547 | /* |
548 | * routine to do AES256 CTR enc/decrypt "by8" |
549 | * XMM registers are clobbered. |
550 | * Saving/restoring must be done at a higher level |
551 | * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, |
552 | * unsigned int num_bytes) |
553 | */ |
554 | SYM_FUNC_START(aes_ctr_enc_256_avx_by8) |
555 | /* call the aes main loop */ |
556 | do_aes_ctrmain KEY_256 0 |
557 | |
558 | SYM_FUNC_END(aes_ctr_enc_256_avx_by8) |
559 | |
560 | /* |
561 | * routine to do AES128 XCTR enc/decrypt "by8" |
562 | * XMM registers are clobbered. |
563 | * Saving/restoring must be done at a higher level |
564 | * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys, |
565 | * u8* out, unsigned int num_bytes, unsigned int byte_ctr) |
566 | */ |
567 | SYM_FUNC_START(aes_xctr_enc_128_avx_by8) |
568 | /* call the aes main loop */ |
569 | do_aes_ctrmain KEY_128 1 |
570 | |
571 | SYM_FUNC_END(aes_xctr_enc_128_avx_by8) |
572 | |
573 | /* |
574 | * routine to do AES192 XCTR enc/decrypt "by8" |
575 | * XMM registers are clobbered. |
576 | * Saving/restoring must be done at a higher level |
577 | * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys, |
578 | * u8* out, unsigned int num_bytes, unsigned int byte_ctr) |
579 | */ |
580 | SYM_FUNC_START(aes_xctr_enc_192_avx_by8) |
581 | /* call the aes main loop */ |
582 | do_aes_ctrmain KEY_192 1 |
583 | |
584 | SYM_FUNC_END(aes_xctr_enc_192_avx_by8) |
585 | |
586 | /* |
587 | * routine to do AES256 XCTR enc/decrypt "by8" |
588 | * XMM registers are clobbered. |
589 | * Saving/restoring must be done at a higher level |
590 | * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, |
591 | * u8* out, unsigned int num_bytes, unsigned int byte_ctr) |
592 | */ |
593 | SYM_FUNC_START(aes_xctr_enc_256_avx_by8) |
594 | /* call the aes main loop */ |
595 | do_aes_ctrmain KEY_256 1 |
596 | |
597 | SYM_FUNC_END(aes_xctr_enc_256_avx_by8) |
598 | |