1/*
2 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
3 *
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
6 *
7 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
11 *
12 * This file is provided under a dual BSD/GPLv2 license. When using or
13 * redistributing this file, you may do so under either license.
14 *
15 * GPL LICENSE SUMMARY
16 *
17 * Copyright(c) 2014 Intel Corporation.
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
22 *
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 * General Public License for more details.
27 *
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
32 *
33 * BSD LICENSE
34 *
35 * Copyright(c) 2014 Intel Corporation.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 *
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
46 * distribution.
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 *
63 */
64
#include <linux/linkage.h>
#include <asm/inst.h>

/* unaligned load/store for in/out text (alignment is not guaranteed) */
#define VMOVDQ vmovdqu

/* xmm0-xmm7: up to 8 counter/keystream blocks processed in parallel */
#define xdata0 %xmm0
#define xdata1 %xmm1
#define xdata2 %xmm2
#define xdata3 %xmm3
#define xdata4 %xmm4
#define xdata5 %xmm5
#define xdata6 %xmm6
#define xdata7 %xmm7
/* running 128-bit counter (kept byte-swapped/LE while processing) */
#define xcounter %xmm8
/* shuffle mask used to byte-swap the counter (byteswap_const) */
#define xbyteswap %xmm9
/*
 * Round keys cached across main-loop iterations.  NOTE: for 128-bit
 * keys these registers actually hold rounds 0/3/6/9 -- see
 * .Lmult_of_8_blks in do_aes_ctrmain.
 */
#define xkey0 %xmm10
#define xkey4 %xmm11
#define xkey8 %xmm12
#define xkey12 %xmm13
/* scratch: remaining round keys and input-text loads */
#define xkeyA %xmm14
#define xkeyB %xmm15

/* function arguments, System V AMD64 calling convention */
#define p_in %rdi
#define p_iv %rsi
#define p_keys %rdx
#define p_out %rcx
#define num_bytes %r8

#define tmp %r10
/* 'name' selectors for the club macro */
#define DDQ_DATA 0
#define XDATA 1
/* key-length selectors (AES-128/192/256) */
#define KEY_128 1
#define KEY_192 2
#define KEY_256 3
99
.section .rodata
.align 16

/* shuffle mask to byte-swap a 128-bit value (stored IV <-> LE counter) */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* selects the low 64 bits of the counter (vptest: ZF set when they wrap) */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* carry value: adds 1 to the high qword when the low qword overflows */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/* low-qword increments used to derive counter+1 .. counter+8 */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text
127
/*
 * generate a unique variable for ddq_add_x
 * (n arrives pre-evaluated via .altmacro '%' expansion in 'club')
 */

.macro setddq n
	/* var_ddq_add = the symbol ddq_add_<n>, referenced via %rip below */
	var_ddq_add = ddq_add_\n
.endm
133
/*
 * generate a unique variable for xmm register
 * (n arrives pre-evaluated via .altmacro '%' expansion in 'club')
 */
.macro setxdata n
	/* var_xdata = register %xmm<n> (i.e. xdata<n> for n in 0..7) */
	var_xdata = %xmm\n
.endm
138
/*
 * club the numeric 'id' to the symbol 'name'
 *
 * .altmacro is required so that '%\id' evaluates the (possibly
 * expression-valued) id to a plain number before it is pasted into
 * the ddq_add_/xmm symbol name by setddq/setxdata.
 */

.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
150
151/*
152 * do_aes num_in_par load_keys key_len
153 * This increments p_in, but not p_out
154 */
155.macro do_aes b, k, key_len
156 .set by, \b
157 .set load_keys, \k
158 .set klen, \key_len
159
160 .if (load_keys)
161 vmovdqa 0*16(p_keys), xkey0
162 .endif
163
164 vpshufb xbyteswap, xcounter, xdata0
165
166 .set i, 1
167 .rept (by - 1)
168 club DDQ_DATA, i
169 club XDATA, i
170 vpaddq var_ddq_add(%rip), xcounter, var_xdata
171 vptest ddq_low_msk(%rip), var_xdata
172 jnz 1f
173 vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
174 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
175 1:
176 vpshufb xbyteswap, var_xdata, var_xdata
177 .set i, (i +1)
178 .endr
179
180 vmovdqa 1*16(p_keys), xkeyA
181
182 vpxor xkey0, xdata0, xdata0
183 club DDQ_DATA, by
184 vpaddq var_ddq_add(%rip), xcounter, xcounter
185 vptest ddq_low_msk(%rip), xcounter
186 jnz 1f
187 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
188 1:
189
190 .set i, 1
191 .rept (by - 1)
192 club XDATA, i
193 vpxor xkey0, var_xdata, var_xdata
194 .set i, (i +1)
195 .endr
196
197 vmovdqa 2*16(p_keys), xkeyB
198
199 .set i, 0
200 .rept by
201 club XDATA, i
202 vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
203 .set i, (i +1)
204 .endr
205
206 .if (klen == KEY_128)
207 .if (load_keys)
208 vmovdqa 3*16(p_keys), xkey4
209 .endif
210 .else
211 vmovdqa 3*16(p_keys), xkeyA
212 .endif
213
214 .set i, 0
215 .rept by
216 club XDATA, i
217 vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
218 .set i, (i +1)
219 .endr
220
221 add $(16*by), p_in
222
223 .if (klen == KEY_128)
224 vmovdqa 4*16(p_keys), xkeyB
225 .else
226 .if (load_keys)
227 vmovdqa 4*16(p_keys), xkey4
228 .endif
229 .endif
230
231 .set i, 0
232 .rept by
233 club XDATA, i
234 /* key 3 */
235 .if (klen == KEY_128)
236 vaesenc xkey4, var_xdata, var_xdata
237 .else
238 vaesenc xkeyA, var_xdata, var_xdata
239 .endif
240 .set i, (i +1)
241 .endr
242
243 vmovdqa 5*16(p_keys), xkeyA
244
245 .set i, 0
246 .rept by
247 club XDATA, i
248 /* key 4 */
249 .if (klen == KEY_128)
250 vaesenc xkeyB, var_xdata, var_xdata
251 .else
252 vaesenc xkey4, var_xdata, var_xdata
253 .endif
254 .set i, (i +1)
255 .endr
256
257 .if (klen == KEY_128)
258 .if (load_keys)
259 vmovdqa 6*16(p_keys), xkey8
260 .endif
261 .else
262 vmovdqa 6*16(p_keys), xkeyB
263 .endif
264
265 .set i, 0
266 .rept by
267 club XDATA, i
268 vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
269 .set i, (i +1)
270 .endr
271
272 vmovdqa 7*16(p_keys), xkeyA
273
274 .set i, 0
275 .rept by
276 club XDATA, i
277 /* key 6 */
278 .if (klen == KEY_128)
279 vaesenc xkey8, var_xdata, var_xdata
280 .else
281 vaesenc xkeyB, var_xdata, var_xdata
282 .endif
283 .set i, (i +1)
284 .endr
285
286 .if (klen == KEY_128)
287 vmovdqa 8*16(p_keys), xkeyB
288 .else
289 .if (load_keys)
290 vmovdqa 8*16(p_keys), xkey8
291 .endif
292 .endif
293
294 .set i, 0
295 .rept by
296 club XDATA, i
297 vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
298 .set i, (i +1)
299 .endr
300
301 .if (klen == KEY_128)
302 .if (load_keys)
303 vmovdqa 9*16(p_keys), xkey12
304 .endif
305 .else
306 vmovdqa 9*16(p_keys), xkeyA
307 .endif
308
309 .set i, 0
310 .rept by
311 club XDATA, i
312 /* key 8 */
313 .if (klen == KEY_128)
314 vaesenc xkeyB, var_xdata, var_xdata
315 .else
316 vaesenc xkey8, var_xdata, var_xdata
317 .endif
318 .set i, (i +1)
319 .endr
320
321 vmovdqa 10*16(p_keys), xkeyB
322
323 .set i, 0
324 .rept by
325 club XDATA, i
326 /* key 9 */
327 .if (klen == KEY_128)
328 vaesenc xkey12, var_xdata, var_xdata
329 .else
330 vaesenc xkeyA, var_xdata, var_xdata
331 .endif
332 .set i, (i +1)
333 .endr
334
335 .if (klen != KEY_128)
336 vmovdqa 11*16(p_keys), xkeyA
337 .endif
338
339 .set i, 0
340 .rept by
341 club XDATA, i
342 /* key 10 */
343 .if (klen == KEY_128)
344 vaesenclast xkeyB, var_xdata, var_xdata
345 .else
346 vaesenc xkeyB, var_xdata, var_xdata
347 .endif
348 .set i, (i +1)
349 .endr
350
351 .if (klen != KEY_128)
352 .if (load_keys)
353 vmovdqa 12*16(p_keys), xkey12
354 .endif
355
356 .set i, 0
357 .rept by
358 club XDATA, i
359 vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
360 .set i, (i +1)
361 .endr
362
363 .if (klen == KEY_256)
364 vmovdqa 13*16(p_keys), xkeyA
365 .endif
366
367 .set i, 0
368 .rept by
369 club XDATA, i
370 .if (klen == KEY_256)
371 /* key 12 */
372 vaesenc xkey12, var_xdata, var_xdata
373 .else
374 vaesenclast xkey12, var_xdata, var_xdata
375 .endif
376 .set i, (i +1)
377 .endr
378
379 .if (klen == KEY_256)
380 vmovdqa 14*16(p_keys), xkeyB
381
382 .set i, 0
383 .rept by
384 club XDATA, i
385 /* key 13 */
386 vaesenc xkeyA, var_xdata, var_xdata
387 .set i, (i +1)
388 .endr
389
390 .set i, 0
391 .rept by
392 club XDATA, i
393 /* key 14 */
394 vaesenclast xkeyB, var_xdata, var_xdata
395 .set i, (i +1)
396 .endr
397 .endif
398 .endif
399
400 .set i, 0
401 .rept (by / 2)
402 .set j, (i+1)
403 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
404 VMOVDQ (j*16 - 16*by)(p_in), xkeyB
405 club XDATA, i
406 vpxor xkeyA, var_xdata, var_xdata
407 club XDATA, j
408 vpxor xkeyB, var_xdata, var_xdata
409 .set i, (i+2)
410 .endr
411
412 .if (i < by)
413 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
414 club XDATA, i
415 vpxor xkeyA, var_xdata, var_xdata
416 .endif
417
418 .set i, 0
419 .rept by
420 club XDATA, i
421 VMOVDQ var_xdata, i*16(p_out)
422 .set i, (i+1)
423 .endr
424.endm
425
/* do_aes with load_keys=1: (re)loads the cached round keys from p_keys */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm
429
/* do_aes with load_keys=0: reuses xkey0/4/8/12 loaded by an earlier call */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm
433
/*
 * main body of aes ctr load
 *
 * Processes the 1..7 leftover blocks first (each tail case reloads the
 * round keys via do_aes_load), then runs the remaining data in 8-block
 * chunks with the cached keys (do_aes_noload).  num_bytes is assumed to
 * be a multiple of the AES block size (16); sub-block tails are not
 * handled here -- TODO confirm against the C-side caller.  The
 * incremented counter is written back to (p_iv) before returning.
 */
.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	/* keep the counter byte-swapped internally; swapped back on return */
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = leftover bytes beyond a multiple of 8 blocks (0..7*16) */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1..7 leftover blocks: binary dispatch on the block count */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	/* round num_bytes down to a multiple of 8 blocks (128 bytes) */
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/*
	 * Pre-load the round keys that stay cached across main-loop
	 * iterations.  AES-128 has only 11 round keys, so rounds 3/6/9
	 * are cached in xkey4/xkey8/xkey12 instead of rounds 4/8/12.
	 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV: swap the counter back to its stored form */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm
539
540/*
541 * routine to do AES128 CTR enc/decrypt "by8"
542 * XMM registers are clobbered.
543 * Saving/restoring must be done at a higher level
544 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
545 * unsigned int num_bytes)
546 */
547ENTRY(aes_ctr_enc_128_avx_by8)
548 /* call the aes main loop */
549 do_aes_ctrmain KEY_128
550
551ENDPROC(aes_ctr_enc_128_avx_by8)
552
553/*
554 * routine to do AES192 CTR enc/decrypt "by8"
555 * XMM registers are clobbered.
556 * Saving/restoring must be done at a higher level
557 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
558 * unsigned int num_bytes)
559 */
560ENTRY(aes_ctr_enc_192_avx_by8)
561 /* call the aes main loop */
562 do_aes_ctrmain KEY_192
563
564ENDPROC(aes_ctr_enc_192_avx_by8)
565
566/*
567 * routine to do AES256 CTR enc/decrypt "by8"
568 * XMM registers are clobbered.
569 * Saving/restoring must be done at a higher level
570 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
571 * unsigned int num_bytes)
572 */
573ENTRY(aes_ctr_enc_256_avx_by8)
574 /* call the aes main loop */
575 do_aes_ctrmain KEY_256
576
577ENDPROC(aes_ctr_enc_256_avx_by8)
578