/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>

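# The CTR*BL constants below are per-block counter increments. ChaCha
# keeps its 32-bit block counter in state word 12, the first word of the
# fourth row. CTR2BL adds counters 0 and 1 to the two row-3 copies held
# in one ymm register, CTR4BL adds 2 and 3 for the second block pair,
# and CTR8BL adds 0..7 across the eight 32-bit lanes used by the
# 8-block function.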
.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
        .octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL: .octa 0x00000000000000000000000000000002
        .octa 0x00000000000000000000000000000003

.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL: .octa 0x00000003000000020000000100000000
        .octa 0x00000007000000060000000500000004

.text

SYM_FUNC_START(chacha_2block_xor_avx512vl)
        # %rdi: Input state matrix, s
        # %rsi: up to 2 data blocks output, o
        # %rdx: up to 2 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts two ChaCha blocks by loading the state
        # matrix twice across four AVX registers. It performs matrix
        # operations on four words in each matrix in parallel, but requires
        # shuffling to rearrange the words after each round.
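        #
        # For reference, each double round below applies the standard
        # ChaCha quarter-round four times (a C-like sketch, not part of
        # the build):
        #
        #       a += b; d ^= a; d = rol32(d, 16);
        #       c += d; b ^= c; b = rol32(b, 12);
        #       a += b; d ^= a; d = rol32(d,  8);
        #       c += d; b ^= c; b = rol32(b,  7);
        #
        # first to the matrix columns, then (after the vpshufd word
        # rotations) to the diagonals.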

        vzeroupper

        # x0..3[0-1] = s0..3
        vbroadcasti128 0x00(%rdi),%ymm0
        vbroadcasti128 0x10(%rdi),%ymm1
        vbroadcasti128 0x20(%rdi),%ymm2
        vbroadcasti128 0x30(%rdi),%ymm3

        vpaddd CTR2BL(%rip),%ymm3,%ymm3

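        # keep a copy of the initial state for the final feed-forward
        # addition (o = i ^ (x + s))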
        vmovdqa %ymm0,%ymm8
        vmovdqa %ymm1,%ymm9
        vmovdqa %ymm2,%ymm10
        vmovdqa %ymm3,%ymm11

.Ldoubleround:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $16,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $12,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $8,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm3,%ymm3

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $16,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $12,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $8,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm3,%ymm3

        sub $2,%r8d
        jnz .Ldoubleround

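        # Each 16-byte chunk below is written only if the length check
        # shows at least that many bytes of input; otherwise we branch to
        # .Lxorpart2, which handles the trailing partial chunk with a
        # masked load/store.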
        # o0 = i0 ^ (x0 + s0)
        vpaddd %ymm8,%ymm0,%ymm7
        cmp $0x10,%rcx
        jl .Lxorpart2
        vpxord 0x00(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x00(%rsi)
        vextracti128 $1,%ymm7,%xmm0
        # o1 = i1 ^ (x1 + s1)
        vpaddd %ymm9,%ymm1,%ymm7
        cmp $0x20,%rcx
        jl .Lxorpart2
        vpxord 0x10(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x10(%rsi)
        vextracti128 $1,%ymm7,%xmm1
        # o2 = i2 ^ (x2 + s2)
        vpaddd %ymm10,%ymm2,%ymm7
        cmp $0x30,%rcx
        jl .Lxorpart2
        vpxord 0x20(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x20(%rsi)
        vextracti128 $1,%ymm7,%xmm2
        # o3 = i3 ^ (x3 + s3)
        vpaddd %ymm11,%ymm3,%ymm7
        cmp $0x40,%rcx
        jl .Lxorpart2
        vpxord 0x30(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x30(%rsi)
        vextracti128 $1,%ymm7,%xmm3

        # xor and write second block
        vmovdqa %xmm0,%xmm7
        cmp $0x50,%rcx
        jl .Lxorpart2
        vpxord 0x40(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x40(%rsi)

        vmovdqa %xmm1,%xmm7
        cmp $0x60,%rcx
        jl .Lxorpart2
        vpxord 0x50(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x50(%rsi)

        vmovdqa %xmm2,%xmm7
        cmp $0x70,%rcx
        jl .Lxorpart2
        vpxord 0x60(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x60(%rsi)

        vmovdqa %xmm3,%xmm7
        cmp $0x80,%rcx
        jl .Lxorpart2
        vpxord 0x70(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x70(%rsi)

.Ldone2:
        vzeroupper
        RET

.Lxorpart2:
        # xor remaining bytes from partial register into output
        mov %rcx,%rax
        and $0xf,%rcx
        jz .Ldone2
        mov %rax,%r9
        and $~0xf,%r9

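        # Build a mask of (len mod 16) one-bits in %k1: shld by %cl yields
        # 1 << %cl here, minus 1 gives the mask. The masked vmovdqu8 below
        # then loads and stores only the remaining bytes.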
        mov $1,%rax
        shld %cl,%rax,%rax
        sub $1,%rax
        kmovq %rax,%k1

        vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
        vpxord %xmm7,%xmm1,%xmm1
        vmovdqu8 %xmm1,(%rsi,%r9){%k1}

        jmp .Ldone2

SYM_FUNC_END(chacha_2block_xor_avx512vl)

SYM_FUNC_START(chacha_4block_xor_avx512vl)
        # %rdi: Input state matrix, s
        # %rsi: up to 4 data blocks output, o
        # %rdx: up to 4 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts four ChaCha blocks by loading the state
        # matrix four times across eight AVX registers. It performs matrix
        # operations on four words in two matrices in parallel, interleaved
        # with the operations on the four words of the other two matrices.
        # Since the required word shuffling has a rather high latency, we can
        # do the arithmetic on two matrix pairs without much slowdown.
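        #
        # Register use: ymm0..3 hold blocks 0/1, ymm4..7 hold blocks 2/3,
        # and ymm11..15 keep the initial state for the final feed-forward
        # addition.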

        vzeroupper

        # x0..3[0-3] = s0..3
        vbroadcasti128 0x00(%rdi),%ymm0
        vbroadcasti128 0x10(%rdi),%ymm1
        vbroadcasti128 0x20(%rdi),%ymm2
        vbroadcasti128 0x30(%rdi),%ymm3

        vmovdqa %ymm0,%ymm4
        vmovdqa %ymm1,%ymm5
        vmovdqa %ymm2,%ymm6
        vmovdqa %ymm3,%ymm7

        vpaddd CTR2BL(%rip),%ymm3,%ymm3
        vpaddd CTR4BL(%rip),%ymm7,%ymm7

        vmovdqa %ymm0,%ymm11
        vmovdqa %ymm1,%ymm12
        vmovdqa %ymm2,%ymm13
        vmovdqa %ymm3,%ymm14
        vmovdqa %ymm7,%ymm15

.Ldoubleround4:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $16,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxord %ymm4,%ymm7,%ymm7
        vprold $16,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $12,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxord %ymm6,%ymm5,%ymm5
        vprold $12,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $8,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxord %ymm4,%ymm7,%ymm7
        vprold $8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $7,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxord %ymm6,%ymm5,%ymm5
        vprold $7,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm1,%ymm1
        vpshufd $0x39,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        vpshufd $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm3,%ymm3
        vpshufd $0x93,%ymm7,%ymm7

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $16,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxord %ymm4,%ymm7,%ymm7
        vprold $16,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $12,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxord %ymm6,%ymm5,%ymm5
        vprold $12,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxord %ymm0,%ymm3,%ymm3
        vprold $8,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxord %ymm4,%ymm7,%ymm7
        vprold $8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxord %ymm2,%ymm1,%ymm1
        vprold $7,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxord %ymm6,%ymm5,%ymm5
        vprold $7,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm1,%ymm1
        vpshufd $0x93,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        vpshufd $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm3,%ymm3
        vpshufd $0x39,%ymm7,%ymm7

        sub $2,%r8d
        jnz .Ldoubleround4

        # o0 = i0 ^ (x0 + s0), first block
        vpaddd %ymm11,%ymm0,%ymm10
        cmp $0x10,%rcx
        jl .Lxorpart4
        vpxord 0x00(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x00(%rsi)
        vextracti128 $1,%ymm10,%xmm0
        # o1 = i1 ^ (x1 + s1), first block
        vpaddd %ymm12,%ymm1,%ymm10
        cmp $0x20,%rcx
        jl .Lxorpart4
        vpxord 0x10(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x10(%rsi)
        vextracti128 $1,%ymm10,%xmm1
        # o2 = i2 ^ (x2 + s2), first block
        vpaddd %ymm13,%ymm2,%ymm10
        cmp $0x30,%rcx
        jl .Lxorpart4
        vpxord 0x20(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x20(%rsi)
        vextracti128 $1,%ymm10,%xmm2
        # o3 = i3 ^ (x3 + s3), first block
        vpaddd %ymm14,%ymm3,%ymm10
        cmp $0x40,%rcx
        jl .Lxorpart4
        vpxord 0x30(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x30(%rsi)
        vextracti128 $1,%ymm10,%xmm3

        # xor and write second block
        vmovdqa %xmm0,%xmm10
        cmp $0x50,%rcx
        jl .Lxorpart4
        vpxord 0x40(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x40(%rsi)

        vmovdqa %xmm1,%xmm10
        cmp $0x60,%rcx
        jl .Lxorpart4
        vpxord 0x50(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x50(%rsi)

        vmovdqa %xmm2,%xmm10
        cmp $0x70,%rcx
        jl .Lxorpart4
        vpxord 0x60(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x60(%rsi)

        vmovdqa %xmm3,%xmm10
        cmp $0x80,%rcx
        jl .Lxorpart4
        vpxord 0x70(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x70(%rsi)

        # o0 = i0 ^ (x0 + s0), third block
        vpaddd %ymm11,%ymm4,%ymm10
        cmp $0x90,%rcx
        jl .Lxorpart4
        vpxord 0x80(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x80(%rsi)
        vextracti128 $1,%ymm10,%xmm4
        # o1 = i1 ^ (x1 + s1), third block
        vpaddd %ymm12,%ymm5,%ymm10
        cmp $0xa0,%rcx
        jl .Lxorpart4
        vpxord 0x90(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x90(%rsi)
        vextracti128 $1,%ymm10,%xmm5
        # o2 = i2 ^ (x2 + s2), third block
        vpaddd %ymm13,%ymm6,%ymm10
        cmp $0xb0,%rcx
        jl .Lxorpart4
        vpxord 0xa0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xa0(%rsi)
        vextracti128 $1,%ymm10,%xmm6
        # o3 = i3 ^ (x3 + s3), third block
        vpaddd %ymm15,%ymm7,%ymm10
        cmp $0xc0,%rcx
        jl .Lxorpart4
        vpxord 0xb0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xb0(%rsi)
        vextracti128 $1,%ymm10,%xmm7

        # xor and write fourth block
        vmovdqa %xmm4,%xmm10
        cmp $0xd0,%rcx
        jl .Lxorpart4
        vpxord 0xc0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xc0(%rsi)

        vmovdqa %xmm5,%xmm10
        cmp $0xe0,%rcx
        jl .Lxorpart4
        vpxord 0xd0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xd0(%rsi)

        vmovdqa %xmm6,%xmm10
        cmp $0xf0,%rcx
        jl .Lxorpart4
        vpxord 0xe0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xe0(%rsi)

        vmovdqa %xmm7,%xmm10
        cmp $0x100,%rcx
        jl .Lxorpart4
        vpxord 0xf0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xf0(%rsi)

.Ldone4:
        vzeroupper
        RET

.Lxorpart4:
        # xor remaining bytes from partial register into output
        mov %rcx,%rax
        and $0xf,%rcx
        jz .Ldone4
        mov %rax,%r9
        and $~0xf,%r9

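        # build a byte mask of (len mod 16) one-bits in %k1, using the
        # same shld trick as in .Lxorpart2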
        mov $1,%rax
        shld %cl,%rax,%rax
        sub $1,%rax
        kmovq %rax,%k1

        vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
        vpxord %xmm10,%xmm1,%xmm1
        vmovdqu8 %xmm1,(%rsi,%r9){%k1}

        jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_avx512vl)

SYM_FUNC_START(chacha_8block_xor_avx512vl)
        # %rdi: Input state matrix, s
        # %rsi: up to 8 data blocks output, o
        # %rdx: up to 8 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts eight consecutive ChaCha blocks by loading
        # the state matrix into AVX registers eight times. Compared to AVX2,
        # this mostly benefits from the new rotate instructions in VL and the
        # additional registers.
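        #
        # The layout is "transposed": each ymm register holds one of the 16
        # state words across all eight blocks, so the rounds need no word
        # shuffling at all; the blocks are only transposed back into byte
        # order by the interleave steps after the round loop.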

        vzeroupper

        # x0..15[0-7] = s[0..15]
        vpbroadcastd 0x00(%rdi),%ymm0
        vpbroadcastd 0x04(%rdi),%ymm1
        vpbroadcastd 0x08(%rdi),%ymm2
        vpbroadcastd 0x0c(%rdi),%ymm3
        vpbroadcastd 0x10(%rdi),%ymm4
        vpbroadcastd 0x14(%rdi),%ymm5
        vpbroadcastd 0x18(%rdi),%ymm6
        vpbroadcastd 0x1c(%rdi),%ymm7
        vpbroadcastd 0x20(%rdi),%ymm8
        vpbroadcastd 0x24(%rdi),%ymm9
        vpbroadcastd 0x28(%rdi),%ymm10
        vpbroadcastd 0x2c(%rdi),%ymm11
        vpbroadcastd 0x30(%rdi),%ymm12
        vpbroadcastd 0x34(%rdi),%ymm13
        vpbroadcastd 0x38(%rdi),%ymm14
        vpbroadcastd 0x3c(%rdi),%ymm15

        # x12 += counter values 0-7
        vpaddd CTR8BL(%rip),%ymm12,%ymm12

        vmovdqa64 %ymm0,%ymm16
        vmovdqa64 %ymm1,%ymm17
        vmovdqa64 %ymm2,%ymm18
        vmovdqa64 %ymm3,%ymm19
        vmovdqa64 %ymm4,%ymm20
        vmovdqa64 %ymm5,%ymm21
        vmovdqa64 %ymm6,%ymm22
        vmovdqa64 %ymm7,%ymm23
        vmovdqa64 %ymm8,%ymm24
        vmovdqa64 %ymm9,%ymm25
        vmovdqa64 %ymm10,%ymm26
        vmovdqa64 %ymm11,%ymm27
        vmovdqa64 %ymm12,%ymm28
        vmovdqa64 %ymm13,%ymm29
        vmovdqa64 %ymm14,%ymm30
        vmovdqa64 %ymm15,%ymm31

.Ldoubleround8:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        vpaddd %ymm0,%ymm4,%ymm0
        vpxord %ymm0,%ymm12,%ymm12
        vprold $16,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        vpaddd %ymm1,%ymm5,%ymm1
        vpxord %ymm1,%ymm13,%ymm13
        vprold $16,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        vpaddd %ymm2,%ymm6,%ymm2
        vpxord %ymm2,%ymm14,%ymm14
        vprold $16,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vpaddd %ymm3,%ymm7,%ymm3
        vpxord %ymm3,%ymm15,%ymm15
        vprold $16,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxord %ymm8,%ymm4,%ymm4
        vprold $12,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxord %ymm9,%ymm5,%ymm5
        vprold $12,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxord %ymm10,%ymm6,%ymm6
        vprold $12,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxord %ymm11,%ymm7,%ymm7
        vprold $12,%ymm7,%ymm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        vpaddd %ymm0,%ymm4,%ymm0
        vpxord %ymm0,%ymm12,%ymm12
        vprold $8,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        vpaddd %ymm1,%ymm5,%ymm1
        vpxord %ymm1,%ymm13,%ymm13
        vprold $8,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        vpaddd %ymm2,%ymm6,%ymm2
        vpxord %ymm2,%ymm14,%ymm14
        vprold $8,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vpaddd %ymm3,%ymm7,%ymm3
        vpxord %ymm3,%ymm15,%ymm15
        vprold $8,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxord %ymm8,%ymm4,%ymm4
        vprold $7,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxord %ymm9,%ymm5,%ymm5
        vprold $7,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxord %ymm10,%ymm6,%ymm6
        vprold $7,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxord %ymm11,%ymm7,%ymm7
        vprold $7,%ymm7,%ymm7

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        vpaddd %ymm0,%ymm5,%ymm0
        vpxord %ymm0,%ymm15,%ymm15
        vprold $16,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        vpaddd %ymm1,%ymm6,%ymm1
        vpxord %ymm1,%ymm12,%ymm12
        vprold $16,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        vpaddd %ymm2,%ymm7,%ymm2
        vpxord %ymm2,%ymm13,%ymm13
        vprold $16,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vpaddd %ymm3,%ymm4,%ymm3
        vpxord %ymm3,%ymm14,%ymm14
        vprold $16,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxord %ymm10,%ymm5,%ymm5
        vprold $12,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxord %ymm11,%ymm6,%ymm6
        vprold $12,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxord %ymm8,%ymm7,%ymm7
        vprold $12,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxord %ymm9,%ymm4,%ymm4
        vprold $12,%ymm4,%ymm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        vpaddd %ymm0,%ymm5,%ymm0
        vpxord %ymm0,%ymm15,%ymm15
        vprold $8,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        vpaddd %ymm1,%ymm6,%ymm1
        vpxord %ymm1,%ymm12,%ymm12
        vprold $8,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        vpaddd %ymm2,%ymm7,%ymm2
        vpxord %ymm2,%ymm13,%ymm13
        vprold $8,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vpaddd %ymm3,%ymm4,%ymm3
        vpxord %ymm3,%ymm14,%ymm14
        vprold $8,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxord %ymm10,%ymm5,%ymm5
        vprold $7,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxord %ymm11,%ymm6,%ymm6
        vprold $7,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxord %ymm8,%ymm7,%ymm7
        vprold $7,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxord %ymm9,%ymm4,%ymm4
        vprold $7,%ymm4,%ymm4

        sub $2,%r8d
        jnz .Ldoubleround8

        # x0..15[0-7] += s[0..15]
        vpaddd %ymm16,%ymm0,%ymm0
        vpaddd %ymm17,%ymm1,%ymm1
        vpaddd %ymm18,%ymm2,%ymm2
        vpaddd %ymm19,%ymm3,%ymm3
        vpaddd %ymm20,%ymm4,%ymm4
        vpaddd %ymm21,%ymm5,%ymm5
        vpaddd %ymm22,%ymm6,%ymm6
        vpaddd %ymm23,%ymm7,%ymm7
        vpaddd %ymm24,%ymm8,%ymm8
        vpaddd %ymm25,%ymm9,%ymm9
        vpaddd %ymm26,%ymm10,%ymm10
        vpaddd %ymm27,%ymm11,%ymm11
        vpaddd %ymm28,%ymm12,%ymm12
        vpaddd %ymm29,%ymm13,%ymm13
        vpaddd %ymm30,%ymm14,%ymm14
        vpaddd %ymm31,%ymm15,%ymm15

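        # Transpose the eight blocks from word-sliced form back into
        # contiguous blocks in three steps: interleave 32-bit words,
        # then 64-bit words, then 128-bit lanes.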
        # interleave 32-bit words in state n, n+1
        vpunpckldq %ymm1,%ymm0,%ymm16
        vpunpckhdq %ymm1,%ymm0,%ymm17
        vpunpckldq %ymm3,%ymm2,%ymm18
        vpunpckhdq %ymm3,%ymm2,%ymm19
        vpunpckldq %ymm5,%ymm4,%ymm20
        vpunpckhdq %ymm5,%ymm4,%ymm21
        vpunpckldq %ymm7,%ymm6,%ymm22
        vpunpckhdq %ymm7,%ymm6,%ymm23
        vpunpckldq %ymm9,%ymm8,%ymm24
        vpunpckhdq %ymm9,%ymm8,%ymm25
        vpunpckldq %ymm11,%ymm10,%ymm26
        vpunpckhdq %ymm11,%ymm10,%ymm27
        vpunpckldq %ymm13,%ymm12,%ymm28
        vpunpckhdq %ymm13,%ymm12,%ymm29
        vpunpckldq %ymm15,%ymm14,%ymm30
        vpunpckhdq %ymm15,%ymm14,%ymm31

        # interleave 64-bit words in state n, n+2
        vpunpcklqdq %ymm18,%ymm16,%ymm0
        vpunpcklqdq %ymm19,%ymm17,%ymm1
        vpunpckhqdq %ymm18,%ymm16,%ymm2
        vpunpckhqdq %ymm19,%ymm17,%ymm3
        vpunpcklqdq %ymm22,%ymm20,%ymm4
        vpunpcklqdq %ymm23,%ymm21,%ymm5
        vpunpckhqdq %ymm22,%ymm20,%ymm6
        vpunpckhqdq %ymm23,%ymm21,%ymm7
        vpunpcklqdq %ymm26,%ymm24,%ymm8
        vpunpcklqdq %ymm27,%ymm25,%ymm9
        vpunpckhqdq %ymm26,%ymm24,%ymm10
        vpunpckhqdq %ymm27,%ymm25,%ymm11
        vpunpcklqdq %ymm30,%ymm28,%ymm12
        vpunpcklqdq %ymm31,%ymm29,%ymm13
        vpunpckhqdq %ymm30,%ymm28,%ymm14
        vpunpckhqdq %ymm31,%ymm29,%ymm15

        # interleave 128-bit words in state n, n+4
        # xor/write first four blocks
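        # (vperm2i128 $0x20 combines the low 128-bit lanes of its two
        # sources, $0x31 the high lanes)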
        vmovdqa64 %ymm0,%ymm16
        vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
        cmp $0x0020,%rcx
        jl .Lxorpart8
        vpxord 0x0000(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0000(%rsi)
        vmovdqa64 %ymm16,%ymm0
        vperm2i128 $0x31,%ymm4,%ymm0,%ymm4

        vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
        cmp $0x0040,%rcx
        jl .Lxorpart8
        vpxord 0x0020(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0020(%rsi)
        vperm2i128 $0x31,%ymm12,%ymm8,%ymm12

        vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
        cmp $0x0060,%rcx
        jl .Lxorpart8
        vpxord 0x0040(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0040(%rsi)
        vperm2i128 $0x31,%ymm6,%ymm2,%ymm6

        vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
        cmp $0x0080,%rcx
        jl .Lxorpart8
        vpxord 0x0060(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0060(%rsi)
        vperm2i128 $0x31,%ymm14,%ymm10,%ymm14

        vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
        cmp $0x00a0,%rcx
        jl .Lxorpart8
        vpxord 0x0080(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0080(%rsi)
        vperm2i128 $0x31,%ymm5,%ymm1,%ymm5

        vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
        cmp $0x00c0,%rcx
        jl .Lxorpart8
        vpxord 0x00a0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x00a0(%rsi)
        vperm2i128 $0x31,%ymm13,%ymm9,%ymm13

        vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
        cmp $0x00e0,%rcx
        jl .Lxorpart8
        vpxord 0x00c0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x00c0(%rsi)
        vperm2i128 $0x31,%ymm7,%ymm3,%ymm7

        vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
        cmp $0x0100,%rcx
        jl .Lxorpart8
        vpxord 0x00e0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x00e0(%rsi)
        vperm2i128 $0x31,%ymm15,%ymm11,%ymm15

        # xor remaining blocks, write to output
        vmovdqa64 %ymm4,%ymm0
        cmp $0x0120,%rcx
        jl .Lxorpart8
        vpxord 0x0100(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0100(%rsi)

        vmovdqa64 %ymm12,%ymm0
        cmp $0x0140,%rcx
        jl .Lxorpart8
        vpxord 0x0120(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0120(%rsi)

        vmovdqa64 %ymm6,%ymm0
        cmp $0x0160,%rcx
        jl .Lxorpart8
        vpxord 0x0140(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0140(%rsi)

        vmovdqa64 %ymm14,%ymm0
        cmp $0x0180,%rcx
        jl .Lxorpart8
        vpxord 0x0160(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0160(%rsi)

        vmovdqa64 %ymm5,%ymm0
        cmp $0x01a0,%rcx
        jl .Lxorpart8
        vpxord 0x0180(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x0180(%rsi)

        vmovdqa64 %ymm13,%ymm0
        cmp $0x01c0,%rcx
        jl .Lxorpart8
        vpxord 0x01a0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x01a0(%rsi)

        vmovdqa64 %ymm7,%ymm0
        cmp $0x01e0,%rcx
        jl .Lxorpart8
        vpxord 0x01c0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x01c0(%rsi)

        vmovdqa64 %ymm15,%ymm0
        cmp $0x0200,%rcx
        jl .Lxorpart8
        vpxord 0x01e0(%rdx),%ymm0,%ymm0
        vmovdqu64 %ymm0,0x01e0(%rsi)

.Ldone8:
        vzeroupper
        RET

.Lxorpart8:
        # xor remaining bytes from partial register into output
        mov %rcx,%rax
        and $0x1f,%rcx
        jz .Ldone8
        mov %rax,%r9
        and $~0x1f,%r9

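        # build a byte mask of (len mod 32) one-bits in %k1, as in
        # .Lxorpart2 but for a 32-byte ymm chunk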
        mov $1,%rax
        shld %cl,%rax,%rax
        sub $1,%rax
        kmovq %rax,%k1

        vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
        vpxord %ymm0,%ymm1,%ymm1
        vmovdqu8 %ymm1,(%rsi,%r9){%k1}

        jmp .Ldone8

SYM_FUNC_END(chacha_8block_xor_avx512vl)