/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a circular buffer of 16 pre-calculated w[i]+K values (64 bytes) on the stack */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16
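
/*
 * For illustration only (not part of this file's build): a minimal C sketch,
 * with a made-up helper name, of how WK(t) maps a round number onto one of
 * the 16 dword slots of the 64-byte workspace that W_PRECALC fills
 * W_PRECALC_AHEAD rounds in advance:
 *
 *	// byte offset of w[t]+K within the circular workspace on the stack
 *	static inline unsigned int wk_offset(unsigned int t)
 *	{
 *		return (t & 15) * 4;	// 16 slots of 4 bytes each
 *	}
 */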

/*
 * This macro generates a SHA-1 transform function that processes the input
 * one 64-byte block at a time.
 * param: the function's name
 */
.macro SHA1_VECTOR_ASM name
SYM_TYPED_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

SYM_FUNC_END(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
  .set i, (i+1)
  .endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current block was the last one, use a
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid a buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# BUFFER_PTR == K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

.macro F1 b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2 b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3 b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4 b, c, d
	F2 \b, \c, \d
.endm

.macro UPDATE_HASH hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e  += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d  += t1;
 */
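
/*
 * For reference, a plain (unfused) C version of one SHA-1 round, against
 * which the interleaved sequence below can be checked. This is a sketch for
 * illustration only; ROL32 and sha1_round are made-up names and are not used
 * anywhere in this file:
 *
 *	// f is the output of F1/F2/F3/F4, wk is w[t] + K for this round
 *	#define ROL32(x, n)  (((x) << (n)) | ((x) >> (32 - (n))))
 *	static void sha1_round(u32 *a, u32 *b, u32 *c, u32 *d, u32 *e,
 *			       u32 f, u32 wk)
 *	{
 *		*e += ROL32(*a, 5) + f + wk;	// e becomes the new working value
 *		*b  = ROL32(*b, 30);
 *		// the caller then rotates the roles: a<=e, b<=a, c<=b, d<=c, e<=d
 *	}
 */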
.macro RR F, a, b, c, d, e, round
	add	WK(\round), \e
	\F	\b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F	\a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a			# ((a <<r 5) >>r 7) => (a <<r 30)

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

.macro W_PRECALC r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)		# pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i < 32)
	W_PRECALC_16_31
  .elseif (i < 80)		// rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculate the last 32 w[i] values in 8 XMM registers
 * - pre-calculate the K+w[i] values and store them to memory, for a later
 *   load by the scalar ALU add instruction
 *
 * rounds 16-31 need some "heavy-lifting" vectorization because of the
 * w[i] -> w[i-3] dependency, but this improves for rounds 32-79
 */
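
/*
 * Note on the 4-wide step below: when computing w[i..i+3] in one vector, the
 * w[i-3] input of the top lane is w[i] itself, which is produced by the same
 * step. The code therefore builds that lane with a zeroed w[i-3] slot (the
 * psrldq $4 leaves lane 3 empty) and patches it afterwards: the pre-rotate
 * value of lane 0 is moved up with pslldq $12 and folded back in rotated left
 * by 2, which equals the missing "w[i] rol 1" contribution.
 */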
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * the SHA-1 specification defines:
 *   w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * we instead use the equivalent:
 *   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization since the w[i] -> w[i-3]
 * dependency is broken
 */
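
/*
 * A small self-checking C sketch of that equivalence (illustration only, not
 * part of this file; the helper name check_schedule is made up): both
 * recurrences produce the same 80-entry message schedule for any block.
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *	#define ROL32(x, n)  (((x) << (n)) | ((x) >> (32 - (n))))
 *	static void check_schedule(const uint32_t m[16])
 *	{
 *		uint32_t w1[80], w2[80];
 *		int i;
 *
 *		for (i = 0; i < 16; i++)
 *			w1[i] = w2[i] = m[i];
 *		// w1[]: the recurrence as given in the specification
 *		for (i = 16; i < 80; i++)
 *			w1[i] = ROL32(w1[i-3] ^ w1[i-8] ^ w1[i-14] ^ w1[i-16], 1);
 *		// w2[]: specification form up to round 31, shifted form after
 *		for (i = 16; i < 32; i++)
 *			w2[i] = ROL32(w2[i-3] ^ w2[i-8] ^ w2[i-14] ^ w2[i-16], 1);
 *		for (i = 32; i < 80; i++)
 *			w2[i] = ROL32(w2[i-6] ^ w2[i-16] ^ w2[i-28] ^ w2[i-32], 2);
 *		for (i = 0; i < 80; i++)
 *			assert(w1[i] == w2[i]);
 *	}
 */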
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3


#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

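/* the four SHA-1 round constants, each replicated across the four dword lanes
 * of one 16-byte vector; the K_XMM offset (0/16/32/48) set up by W_PRECALC
 * selects the entry matching the current round group */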
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

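/* pshufb control mask that byte-swaps each 32-bit lane, converting the
 * big-endian message words into the little-endian order the ALU expects */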
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *                                      const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
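
/*
 * Hypothetical call sketch, for illustration only (in the kernel this entry
 * point is reached through the SHA-1 glue code, which selects the SSSE3 or
 * AVX variant and brackets the call with kernel_fpu_begin()/kernel_fpu_end()):
 *
 *	// state->state[0..4] holds the five 32-bit SHA-1 chaining values;
 *	// data must contain exactly 'blocks' whole 64-byte blocks
 *	sha1_transform_ssse3(state, data, blocks);
 */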
SHA1_VECTOR_ASM sha1_transform_ssse3

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro W_PRECALC_00_15
	W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro W_PRECALC_16_31
	W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro W_PRECALC_32_79
	W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W	# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 * extern "C" void sha1_transform_avx(struct sha1_state *state,
 *                                    const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM sha1_transform_avx