/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
/*
 * AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 */
/*
 * This is an optimized implementation of AES-128/192/256 in CTR mode. It
 * requires Intel(R) AES-NI and AVX instruction support.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in the Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * https://github.com/intel/intel-ipsec-mb
 */

#include <linux/linkage.h>

#define VMOVDQ		vmovdqu

/*
 * Note: the "x" prefix in these aliases means "this is an xmm register". The
 * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
 * counter".
 */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8	// CTR mode only
#define xiv		%xmm8	// XCTR mode only
#define xbyteswap	%xmm9	// CTR mode only
#define xtmp		%xmm9	// XCTR mode only
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
#define counter		%r9	// XCTR mode only
#define tmp		%r10
#define DDQ_DATA	0
#define XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
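
/*
 * The parameter registers above follow the System V AMD64 calling
 * convention: integer arguments 1-6 arrive in %rdi, %rsi, %rdx, %rcx, %r8
 * and %r9, which map to p_in, p_iv, p_keys, p_out, num_bytes and (for XCTR
 * only) counter. KEY_128/KEY_192/KEY_256 select the key schedule length:
 * 11, 13 or 15 round keys, i.e. 10, 12 or 14 encryption rounds.
 */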

.section .rodata
.align 16

byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
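
/*
 * byteswap_const is a vpshufb mask that reverses all 16 bytes of a block,
 * converting the big-endian CTR counter block to little-endian for vpaddq
 * arithmetic and back. ddq_add_1..ddq_add_8 are 128-bit little-endian
 * constants 1..8 used to advance the counter. ddq_low_msk masks the low 64
 * bits so that vptest can detect a low-qword wraparound, after which
 * ddq_high_add_1 propagates the carry into the high qword.
 */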

.text

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/* club the numeric 'id' to the symbol 'name' */

.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
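
/*
 * The club macro relies on .altmacro expansion so that a numeric loop
 * counter can be spliced into a register name: "club XDATA, 3" evaluates
 * the id with the % operator, expands to "setxdata 3", and leaves
 * var_xdata aliased to %xmm3. This is what lets the do_aes macro below
 * step through xdata0..xdata7 with a .rept loop index.
 */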

/*
 * do_aes num_in_par load_keys key_len xctr
 *
 * num_in_par: number of blocks to process in parallel (1-8)
 * load_keys:  if nonzero, load the cached round keys into xkey0/4/8/12
 *             (exactly which round keys are cached depends on key_len)
 * key_len:    KEY_128, KEY_192 or KEY_256
 * xctr:       nonzero selects XCTR mode, zero selects CTR mode
 *
 * This increments p_in, but not p_out
 */
.macro do_aes b, k, key_len, xctr
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	.if \xctr
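		/*
		 * XCTR keystream block i is AES_K(IV ^ le128(i)), with block
		 * numbering starting at 1. 'counter' holds the block index
		 * of the first block of this call; ddq_add_1 + 16*i supplies
		 * the little-endian constant (i + 1), so var_xdata ends up
		 * holding IV ^ le128(counter + i + 1) before encryption. No
		 * byte swapping is needed because XCTR defines the counter
		 * encoding as little-endian.
		 */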
		movq	counter, xtmp
		.set i, 0
		.rept (by)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
			.set i, (i + 1)
		.endr
		.set i, 0
		.rept (by)
			club XDATA, i
			vpxor	xiv, var_xdata, var_xdata
			.set i, (i + 1)
		.endr
	.else
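		/*
		 * CTR mode: xcounter is kept byte-reversed (little-endian)
		 * so the counter can be advanced with vpaddq on the low
		 * qword. vptest against ddq_low_msk sets ZF when the low 64
		 * bits have wrapped to zero, in which case ddq_high_add_1
		 * carries the increment into the high qword. Each counter
		 * block is then shuffled back to big-endian for encryption.
		 */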
		vpshufb	xbyteswap, xcounter, xdata0
		.set i, 1
		.rept (by - 1)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
			vptest	ddq_low_msk(%rip), var_xdata
			jnz	1f
			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
			1:
			vpshufb	xbyteswap, var_xdata, var_xdata
			.set i, (i + 1)
		.endr
	.endif

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	.if \xctr
		add	$by, counter
	.else
		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
		vptest	ddq_low_msk(%rip), xcounter
		jnz	1f
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
	.endif

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i + 1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i + 1)
	.endr

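	/*
	 * Advance the input pointer here; the input blocks are read at the
	 * end of this macro with negative offsets (i*16 - 16*by) relative
	 * to the advanced p_in.
	 */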
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i + 1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast	xkey12, var_xdata, var_xdata
			.endif
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i + 1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i + 1)
			.endr
		.endif
	.endif

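	/*
	 * XOR the encrypted counter blocks with the input to produce the
	 * output. The round keys in xkeyA/xkeyB are dead at this point, so
	 * those registers are reused to stage the input blocks; p_in was
	 * already advanced above, hence the negative offsets.
	 */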
	.set i, 0
	.rept (by / 2)
		.set j, (i + 1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i + 2)
	.endr

	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i + 1)
	.endr
.endm

.macro do_aes_load val, key_len, xctr
	do_aes \val, 1, \key_len, \xctr
.endm

.macro do_aes_noload val, key_len, xctr
	do_aes \val, 0, \key_len, \xctr
.endm
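
/*
 * do_aes_load is used once per call, for the first group of 1-8 blocks: as
 * a side effect it pulls the four cached round keys into xkey0/4/8/12. The
 * 8-block main loop then uses do_aes_noload, reusing the cached key
 * registers across iterations and reloading only the round keys that live
 * in the xkeyA/xkeyB scratch registers.
 */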

/* main body of the AES CTR / XCTR "by8" routines */

.macro do_aes_ctrmain key_len, xctr
	cmp	$16, num_bytes
	jb	.Ldo_return2\xctr\key_len
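
	/*
	 * Set up the per-call state: in XCTR mode, convert the byte counter
	 * to a block index and load the IV; in CTR mode, load the counter
	 * block from p_iv and byte-swap it to little-endian for the counter
	 * arithmetic in do_aes.
	 */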
	.if \xctr
		shr	$4, counter
		vmovdqu	(p_iv), xiv
	.else
		vmovdqa	byteswap_const(%rip), xbyteswap
		vmovdqu	(p_iv), xcounter
		vpshufb	xbyteswap, xcounter, xcounter
	.endif

	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\xctr\key_len
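
	/*
	 * num_bytes is not a multiple of 8 blocks: encrypt the leftover
	 * 1-7 blocks first (loading the round keys as a side effect), then
	 * fall into the 8-blocks-at-a-time main loop for the remainder.
	 */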

	/* tmp = 16 * n, where n = 1..7 leftover blocks */
	cmp	$(4*16), tmp
	jg	.Lgt4\xctr\key_len
	je	.Leq4\xctr\key_len

.Llt4\xctr\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\xctr\key_len
	je	.Leq2\xctr\key_len

.Leq1\xctr\key_len:
	do_aes_load	1, \key_len, \xctr
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq2\xctr\key_len:
	do_aes_load	2, \key_len, \xctr
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq3\xctr\key_len:
	do_aes_load	3, \key_len, \xctr
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq4\xctr\key_len:
	do_aes_load	4, \key_len, \xctr
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lgt4\xctr\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\xctr\key_len
	je	.Leq6\xctr\key_len

.Leq5\xctr\key_len:
	do_aes_load	5, \key_len, \xctr
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq6\xctr\key_len:
	do_aes_load	6, \key_len, \xctr
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq7\xctr\key_len:
	do_aes_load	7, \key_len, \xctr
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lmult_of_8_blks\xctr\key_len:
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\xctr\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 */
	do_aes_noload	8, \key_len, \xctr
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\xctr\key_len

.Ldo_return2\xctr\key_len:
	.if !\xctr
		/* return the updated IV, byte-swapped back to big-endian */
		vpshufb	xbyteswap, xcounter, xcounter
		vmovdqu	xcounter, (p_iv)
	.endif
	RET
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128 0

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192 0

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256 0

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)

/*
 * routine to do AES128 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *			u8 *out, unsigned int num_bytes, unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128 1

SYM_FUNC_END(aes_xctr_enc_128_avx_by8)

/*
 * routine to do AES192 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *			u8 *out, unsigned int num_bytes, unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192 1

SYM_FUNC_END(aes_xctr_enc_192_avx_by8)

/*
 * routine to do AES256 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *			u8 *out, unsigned int num_bytes, unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256 1

SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
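
/*
 * Illustrative C-side usage sketch, matching the prototypes documented
 * above (the real callers live in the aesni-intel glue code; 'src', 'dst',
 * 'iv', 'ctx' and 'nblocks' here are hypothetical). The caller must wrap
 * the call in kernel_fpu_begin()/kernel_fpu_end() since the XMM registers
 * are clobbered, and must handle any final partial block itself, since
 * these routines only process whole 16-byte blocks:
 *
 *	asmlinkage void aes_ctr_enc_128_avx_by8(void *in, void *iv,
 *						void *keys, void *out,
 *						unsigned int num_bytes);
 *
 *	// encrypt nblocks full blocks, advancing the IV in place
 *	kernel_fpu_begin();
 *	aes_ctr_enc_128_avx_by8(src, iv, ctx->key_enc, dst, nblocks * 16);
 *	kernel_fpu_end();
 */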