/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated chacha20 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
#		  size_t len, int nrounds);
#
# do rounds, 8 quarter rounds per double round (column + diagonal)
# 1. a += b; d ^= a; d <<<= 16;
# 2. c += d; b ^= c; b <<<= 12;
# 3. a += b; d ^= a; d <<<= 8;
# 4. c += d; b ^= c; b <<<= 7;
#
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
#
# 4 blocks (a b c d)
#
# a0 b0 c0 d0
# a1 b1 c1 d1
# ...
# a4 b4 c4 d4
# ...
# a8 b8 c8 d8
# ...
# a12 b12 c12 d12
# a13 ...
# a14 ...
# a15 b15 c15 d15
#
# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
#

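# For reference, the four steps above correspond to this scalar C sketch of a
# single ChaCha quarter round (illustrative only; the helper name is arbitrary
# and not part of this file's interface):
#
#	static inline void qr(u32 *a, u32 *b, u32 *c, u32 *d)
#	{
#		*a += *b; *d ^= *a; *d = rol32(*d, 16);
#		*c += *d; *b ^= *c; *b = rol32(*b, 12);
#		*a += *b; *d ^= *a; *d = rol32(*d, 8);
#		*c += *d; *b ^= *c; *b = rol32(*b, 7);
#	}
#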
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"
.text

.macro SAVE_GPR GPR OFFSET FRAME
	std \GPR,\OFFSET(\FRAME)
.endm

.macro SAVE_VRS VRS OFFSET FRAME
	li 16, \OFFSET
	stvx \VRS, 16, \FRAME
.endm

.macro SAVE_VSX VSX OFFSET FRAME
	li 16, \OFFSET
	stxvx \VSX, 16, \FRAME
.endm

.macro RESTORE_GPR GPR OFFSET FRAME
	ld \GPR,\OFFSET(\FRAME)
.endm

.macro RESTORE_VRS VRS OFFSET FRAME
	li 16, \OFFSET
	lvx \VRS, 16, \FRAME
.endm

.macro RESTORE_VSX VSX OFFSET FRAME
	li 16, \OFFSET
	lxvx \VSX, 16, \FRAME
.endm

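# SAVE_REGS/RESTORE_REGS spill and reload the non-volatile state used below:
# a 752-byte stack frame is created, GPRs r14-r31 are saved at offsets
# 112-248, and the vector save area starts at r1+256 (VRs v20-v31 at 0-176,
# VSRs vs14-vs31 at 192-464 within that area).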
.macro SAVE_REGS
	mflr 0
	std 0, 16(1)
	stdu 1,-752(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	addi 9, 1, 256
	SAVE_VRS 20, 0, 9
	SAVE_VRS 21, 16, 9
	SAVE_VRS 22, 32, 9
	SAVE_VRS 23, 48, 9
	SAVE_VRS 24, 64, 9
	SAVE_VRS 25, 80, 9
	SAVE_VRS 26, 96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9

	SAVE_VSX 14, 192, 9
	SAVE_VSX 15, 208, 9
	SAVE_VSX 16, 224, 9
	SAVE_VSX 17, 240, 9
	SAVE_VSX 18, 256, 9
	SAVE_VSX 19, 272, 9
	SAVE_VSX 20, 288, 9
	SAVE_VSX 21, 304, 9
	SAVE_VSX 22, 320, 9
	SAVE_VSX 23, 336, 9
	SAVE_VSX 24, 352, 9
	SAVE_VSX 25, 368, 9
	SAVE_VSX 26, 384, 9
	SAVE_VSX 27, 400, 9
	SAVE_VSX 28, 416, 9
	SAVE_VSX 29, 432, 9
	SAVE_VSX 30, 448, 9
	SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
	addi 9, 1, 256
	RESTORE_VRS 20, 0, 9
	RESTORE_VRS 21, 16, 9
	RESTORE_VRS 22, 32, 9
	RESTORE_VRS 23, 48, 9
	RESTORE_VRS 24, 64, 9
	RESTORE_VRS 25, 80, 9
	RESTORE_VRS 26, 96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_VSX 14, 192, 9
	RESTORE_VSX 15, 208, 9
	RESTORE_VSX 16, 224, 9
	RESTORE_VSX 17, 240, 9
	RESTORE_VSX 18, 256, 9
	RESTORE_VSX 19, 272, 9
	RESTORE_VSX 20, 288, 9
	RESTORE_VSX 21, 304, 9
	RESTORE_VSX 22, 320, 9
	RESTORE_VSX 23, 336, 9
	RESTORE_VSX 24, 352, 9
	RESTORE_VSX 25, 368, 9
	RESTORE_VSX 26, 384, 9
	RESTORE_VSX 27, 400, 9
	RESTORE_VSX 28, 416, 9
	RESTORE_VSX 29, 432, 9
	RESTORE_VSX 30, 448, 9
	RESTORE_VSX 31, 464, 9

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi 1, 1, 752
	ld 0, 16(1)
	mtlr 0
.endm # RESTORE_REGS

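# Eight interleaved blocks per pass: v0-v15 hold one set of 4 block states and
# v16-v31 a second set of 4 (counters offset by 4).  One QT_loop_8x invocation
# performs one column round and one diagonal round on all 8 blocks, borrowing
# v25/v28 as scratch for the rotate/permute constants kept in vs20-vs23.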
.macro QT_loop_8x
	# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	xxlor 0, 32+25, 32+25		# stash v25 in vs0
	xxlor 32+25, 20, 20		# v25 = <<< 16 permute pattern
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vadduwm 16, 16, 20
	vadduwm 17, 17, 21
	vadduwm 18, 18, 22
	vadduwm 19, 19, 23

	vpermxor 12, 12, 0, 25
	vpermxor 13, 13, 1, 25
	vpermxor 14, 14, 2, 25
	vpermxor 15, 15, 3, 25
	vpermxor 28, 28, 16, 25
	vpermxor 29, 29, 17, 25
	vpermxor 30, 30, 18, 25
	vpermxor 31, 31, 19, 25
	xxlor 32+25, 0, 0		# restore v25
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vadduwm 24, 24, 28
	vadduwm 25, 25, 29
	vadduwm 26, 26, 30
	vadduwm 27, 27, 31
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vxor 20, 20, 24
	vxor 21, 21, 25
	vxor 22, 22, 26
	vxor 23, 23, 27

	xxlor 0, 32+25, 32+25		# stash v25 in vs0
	xxlor 32+25, 21, 21		# v25 = rotate count 12
	vrlw 4, 4, 25			# <<< 12
	vrlw 5, 5, 25
	vrlw 6, 6, 25
	vrlw 7, 7, 25
	vrlw 20, 20, 25			# <<< 12
	vrlw 21, 21, 25
	vrlw 22, 22, 25
	vrlw 23, 23, 25
	xxlor 32+25, 0, 0		# restore v25
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vadduwm 16, 16, 20
	vadduwm 17, 17, 21
	vadduwm 18, 18, 22
	vadduwm 19, 19, 23

	xxlor 0, 32+25, 32+25		# stash v25 in vs0
	xxlor 32+25, 22, 22		# v25 = <<< 8 permute pattern
	vpermxor 12, 12, 0, 25
	vpermxor 13, 13, 1, 25
	vpermxor 14, 14, 2, 25
	vpermxor 15, 15, 3, 25
	vpermxor 28, 28, 16, 25
	vpermxor 29, 29, 17, 25
	vpermxor 30, 30, 18, 25
	vpermxor 31, 31, 19, 25
	xxlor 32+25, 0, 0		# restore v25
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vadduwm 24, 24, 28
	vadduwm 25, 25, 29
	vadduwm 26, 26, 30
	vadduwm 27, 27, 31
	xxlor 0, 32+28, 32+28		# stash v28 in vs0
	xxlor 32+28, 23, 23		# v28 = rotate count 7
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vxor 20, 20, 24
	vxor 21, 21, 25
	vxor 22, 22, 26
	vxor 23, 23, 27
	vrlw 4, 4, 28			# <<< 7
	vrlw 5, 5, 28
	vrlw 6, 6, 28
	vrlw 7, 7, 28
	vrlw 20, 20, 28			# <<< 7
	vrlw 21, 21, 28
	vrlw 22, 22, 28
	vrlw 23, 23, 28
	xxlor 32+28, 0, 0		# restore v28

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
	xxlor 0, 32+25, 32+25		# stash v25 in vs0
	xxlor 32+25, 20, 20		# v25 = <<< 16 permute pattern
	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vadduwm 16, 16, 21
	vadduwm 17, 17, 22
	vadduwm 18, 18, 23
	vadduwm 19, 19, 20

	vpermxor 15, 15, 0, 25
	vpermxor 12, 12, 1, 25
	vpermxor 13, 13, 2, 25
	vpermxor 14, 14, 3, 25
	vpermxor 31, 31, 16, 25
	vpermxor 28, 28, 17, 25
	vpermxor 29, 29, 18, 25
	vpermxor 30, 30, 19, 25

	xxlor 32+25, 0, 0		# restore v25
	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vadduwm 26, 26, 31
	vadduwm 27, 27, 28
	vadduwm 24, 24, 29
	vadduwm 25, 25, 30
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vxor 21, 21, 26
	vxor 22, 22, 27
	vxor 23, 23, 24
	vxor 20, 20, 25

	xxlor 0, 32+25, 32+25		# stash v25 in vs0
	xxlor 32+25, 21, 21		# v25 = rotate count 12
	vrlw 5, 5, 25			# <<< 12
	vrlw 6, 6, 25
	vrlw 7, 7, 25
	vrlw 4, 4, 25
	vrlw 21, 21, 25
	vrlw 22, 22, 25
	vrlw 23, 23, 25
	vrlw 20, 20, 25
	xxlor 32+25, 0, 0		# restore v25

	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vadduwm 16, 16, 21
	vadduwm 17, 17, 22
	vadduwm 18, 18, 23
	vadduwm 19, 19, 20

	xxlor 0, 32+25, 32+25		# stash v25 in vs0
	xxlor 32+25, 22, 22		# v25 = <<< 8 permute pattern
	vpermxor 15, 15, 0, 25
	vpermxor 12, 12, 1, 25
	vpermxor 13, 13, 2, 25
	vpermxor 14, 14, 3, 25
	vpermxor 31, 31, 16, 25
	vpermxor 28, 28, 17, 25
	vpermxor 29, 29, 18, 25
	vpermxor 30, 30, 19, 25
	xxlor 32+25, 0, 0		# restore v25

	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vadduwm 26, 26, 31
	vadduwm 27, 27, 28
	vadduwm 24, 24, 29
	vadduwm 25, 25, 30

	xxlor 0, 32+28, 32+28		# stash v28 in vs0
	xxlor 32+28, 23, 23		# v28 = rotate count 7
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vxor 21, 21, 26
	vxor 22, 22, 27
	vxor 23, 23, 24
	vxor 20, 20, 25
	vrlw 5, 5, 28			# <<< 7
	vrlw 6, 6, 28
	vrlw 7, 7, 28
	vrlw 4, 4, 28
	vrlw 21, 21, 28
	vrlw 22, 22, 28
	vrlw 23, 23, 28
	vrlw 20, 20, 28
	xxlor 32+28, 0, 0		# restore v28
.endm

.macro QT_loop_4x
	# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vpermxor 12, 12, 0, 20		# d ^= a; d <<<= 16
	vpermxor 13, 13, 1, 20
	vpermxor 14, 14, 2, 20
	vpermxor 15, 15, 3, 20
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vrlw 4, 4, 21			# b <<<= 12
	vrlw 5, 5, 21
	vrlw 6, 6, 21
	vrlw 7, 7, 21
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	vpermxor 12, 12, 0, 22		# d ^= a; d <<<= 8
	vpermxor 13, 13, 1, 22
	vpermxor 14, 14, 2, 22
	vpermxor 15, 15, 3, 22
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vrlw 4, 4, 23			# b <<<= 7
	vrlw 5, 5, 23
	vrlw 6, 6, 23
	vrlw 7, 7, 23

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vpermxor 15, 15, 0, 20		# d ^= a; d <<<= 16
	vpermxor 12, 12, 1, 20
	vpermxor 13, 13, 2, 20
	vpermxor 14, 14, 3, 20
	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vrlw 5, 5, 21			# b <<<= 12
	vrlw 6, 6, 21
	vrlw 7, 7, 21
	vrlw 4, 4, 21
	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	vpermxor 15, 15, 0, 22		# d ^= a; d <<<= 8
	vpermxor 12, 12, 1, 22
	vpermxor 13, 13, 2, 22
	vpermxor 14, 14, 3, 22
	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vrlw 5, 5, 23			# b <<<= 7
	vrlw 6, 6, 23
	vrlw 7, 7, 23
	vrlw 4, 4, 23
.endm

# Transpose: before this macro each register holds one state word for four
# different blocks; after it each register holds four consecutive state words
# of a single block, ready to be combined with contiguous input/output.
.macro TP_4x a0 a1 a2 a3
	xxmrghw 10, 32+\a0, 32+\a1	# a0, a1, b0, b1
	xxmrghw 11, 32+\a2, 32+\a3	# a2, a3, b2, b3
	xxmrglw 12, 32+\a0, 32+\a1	# c0, c1, d0, d1
	xxmrglw 13, 32+\a2, 32+\a3	# c2, c3, d2, d3
	xxpermdi 32+\a0, 10, 11, 0	# a0, a1, a2, a3
	xxpermdi 32+\a1, 10, 11, 3	# b0, b1, b2, b3
	xxpermdi 32+\a2, 12, 13, 0	# c0, c1, c2, c3
	xxpermdi 32+\a3, 12, 13, 3	# d0, d1, d2, d3
.endm

# keystream = working state + initial state
.macro Add_state S
	vadduwm \S+0, \S+0, 16-\S
	vadduwm \S+4, \S+4, 17-\S
	vadduwm \S+8, \S+8, 18-\S
	vadduwm \S+12, \S+12, 19-\S

	vadduwm \S+1, \S+1, 16-\S
	vadduwm \S+5, \S+5, 17-\S
	vadduwm \S+9, \S+9, 18-\S
	vadduwm \S+13, \S+13, 19-\S

	vadduwm \S+2, \S+2, 16-\S
	vadduwm \S+6, \S+6, 17-\S
	vadduwm \S+10, \S+10, 18-\S
	vadduwm \S+14, \S+14, 19-\S

	vadduwm \S+3, \S+3, 16-\S
	vadduwm \S+7, \S+7, 17-\S
	vadduwm \S+11, \S+11, 18-\S
	vadduwm \S+15, \S+15, 19-\S
.endm

#
# Load 256 bytes of input, XOR them with 256 bytes of keystream and write
# the result to the output buffer.
#
.macro Write_256 S
	add 9, 14, 5
	add 16, 14, 4
	lxvw4x 0, 0, 9
	lxvw4x 1, 17, 9
	lxvw4x 2, 18, 9
	lxvw4x 3, 19, 9
	lxvw4x 4, 20, 9
	lxvw4x 5, 21, 9
	lxvw4x 6, 22, 9
	lxvw4x 7, 23, 9
	lxvw4x 8, 24, 9
	lxvw4x 9, 25, 9
	lxvw4x 10, 26, 9
	lxvw4x 11, 27, 9
	lxvw4x 12, 28, 9
	lxvw4x 13, 29, 9
	lxvw4x 14, 30, 9
	lxvw4x 15, 31, 9

	xxlxor \S+32, \S+32, 0
	xxlxor \S+36, \S+36, 1
	xxlxor \S+40, \S+40, 2
	xxlxor \S+44, \S+44, 3
	xxlxor \S+33, \S+33, 4
	xxlxor \S+37, \S+37, 5
	xxlxor \S+41, \S+41, 6
	xxlxor \S+45, \S+45, 7
	xxlxor \S+34, \S+34, 8
	xxlxor \S+38, \S+38, 9
	xxlxor \S+42, \S+42, 10
	xxlxor \S+46, \S+46, 11
	xxlxor \S+35, \S+35, 12
	xxlxor \S+39, \S+39, 13
	xxlxor \S+43, \S+43, 14
	xxlxor \S+47, \S+47, 15

	stxvw4x \S+32, 0, 16
	stxvw4x \S+36, 17, 16
	stxvw4x \S+40, 18, 16
	stxvw4x \S+44, 19, 16

	stxvw4x \S+33, 20, 16
	stxvw4x \S+37, 21, 16
	stxvw4x \S+41, 22, 16
	stxvw4x \S+45, 23, 16

	stxvw4x \S+34, 24, 16
	stxvw4x \S+38, 25, 16
	stxvw4x \S+42, 26, 16
	stxvw4x \S+46, 27, 16

	stxvw4x \S+35, 28, 16
	stxvw4x \S+39, 29, 16
	stxvw4x \S+43, 30, 16
	stxvw4x \S+47, 31, 16

.endm
576
577#
578# chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
579#
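# A minimal call sketch from C (illustrative only; the real caller lives in
# the arch glue code, and "state" is the 16-word ChaCha state holding the
# constants, key, counter and nonce):
#
#	chacha_p10le_8x(state, dst, src, len, 20);
#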
SYM_FUNC_START(chacha_p10le_8x)
.align 5
	cmpdi 6, 0
	ble Out_no_chacha

	SAVE_REGS

	# r17 - r31 mainly for Write_256 macro.
	li 17, 16
	li 18, 32
	li 19, 48
	li 20, 64
	li 21, 80
	li 22, 96
	li 23, 112
	li 24, 128
	li 25, 144
	li 26, 160
	li 27, 176
	li 28, 192
	li 29, 208
	li 30, 224
	li 31, 240

	mr 15, 6			# len
	li 14, 0			# offset to inp and outp

	lxvw4x 48, 0, 3			# vr16, constants
	lxvw4x 49, 17, 3		# vr17, key 1
	lxvw4x 50, 18, 3		# vr18, key 2
	lxvw4x 51, 19, 3		# vr19, counter, nonce

	# create (0, 1, 2, 3) counters
	vspltisw 0, 0
	vspltisw 1, 1
	vspltisw 2, 2
	vspltisw 3, 3
	vmrghw 4, 0, 1
	vmrglw 5, 2, 3
	vsldoi 30, 4, 5, 8		# vr30 counter, 4 (0, 1, 2, 3)
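	# v30 now holds the per-lane block-counter offsets (0, 1, 2, 3), so
	# the four interleaved blocks of a batch each get a distinct counter;
	# v31, set up below as v30 + 4, provides (4, 5, 6, 7) for the second
	# batch used by the 8x loop.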

	vspltisw 21, 12
	vspltisw 23, 7

	addis 11, 2, permx@toc@ha
	addi 11, 11, permx@toc@l
	lxvw4x 32+20, 0, 11
	lxvw4x 32+22, 17, 11

	sradi 8, 7, 1			# r8 = nrounds / 2 (double rounds)

	mtctr 8

	# stash the state rows in vs16-vs19
	xxlor 16, 48, 48
	xxlor 17, 49, 49
	xxlor 18, 50, 50
	xxlor 19, 51, 51

	vspltisw 25, 4
	vspltisw 26, 8

	xxlor 25, 32+26, 32+26		# vs25 = (8, 8, 8, 8)
	xxlor 24, 32+25, 32+25		# vs24 = (4, 4, 4, 4)

	vadduwm 31, 30, 25		# counter = (0, 1, 2, 3) + (4, 4, 4, 4)
	xxlor 30, 32+30, 32+30
	xxlor 31, 32+31, 32+31

	# stash rotate/permute constants in vs20-vs23
	xxlor 20, 32+20, 32+20
	xxlor 21, 32+21, 32+21
	xxlor 22, 32+22, 32+22
	xxlor 23, 32+23, 32+23

	cmpdi 6, 512
	blt Loop_last

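# Main loop: generate 8 blocks (512 bytes) of keystream per iteration.
# The state is splatted word-by-word into v0-v15 and again into v16-v31,
# the block counters of the two sets are offset by (0, 1, 2, 3) and
# (4, 5, 6, 7), nrounds/2 double rounds are run, and the results are
# transposed, added to the initial state and XORed against 512 bytes of
# input.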
Loop_8x:
	xxspltw 32+0, 16, 0
	xxspltw 32+1, 16, 1
	xxspltw 32+2, 16, 2
	xxspltw 32+3, 16, 3

	xxspltw 32+4, 17, 0
	xxspltw 32+5, 17, 1
	xxspltw 32+6, 17, 2
	xxspltw 32+7, 17, 3
	xxspltw 32+8, 18, 0
	xxspltw 32+9, 18, 1
	xxspltw 32+10, 18, 2
	xxspltw 32+11, 18, 3
	xxspltw 32+12, 19, 0
	xxspltw 32+13, 19, 1
	xxspltw 32+14, 19, 2
	xxspltw 32+15, 19, 3
	vadduwm 12, 12, 30		# increase counter

	xxspltw 32+16, 16, 0
	xxspltw 32+17, 16, 1
	xxspltw 32+18, 16, 2
	xxspltw 32+19, 16, 3

	xxspltw 32+20, 17, 0
	xxspltw 32+21, 17, 1
	xxspltw 32+22, 17, 2
	xxspltw 32+23, 17, 3
	xxspltw 32+24, 18, 0
	xxspltw 32+25, 18, 1
	xxspltw 32+26, 18, 2
	xxspltw 32+27, 18, 3
	xxspltw 32+28, 19, 0
	xxspltw 32+29, 19, 1
	vadduwm 28, 28, 31		# increase counter
	xxspltw 32+30, 19, 2
	xxspltw 32+31, 19, 3

.align 5
quarter_loop_8x:
	QT_loop_8x

	bdnz quarter_loop_8x

	xxlor 0, 32+30, 32+30
	xxlor 32+30, 30, 30
	vadduwm 12, 12, 30
	xxlor 32+30, 0, 0
	TP_4x 0, 1, 2, 3
	TP_4x 4, 5, 6, 7
	TP_4x 8, 9, 10, 11
	TP_4x 12, 13, 14, 15

	xxlor 0, 48, 48
	xxlor 1, 49, 49
	xxlor 2, 50, 50
	xxlor 3, 51, 51
	xxlor 48, 16, 16
	xxlor 49, 17, 17
	xxlor 50, 18, 18
	xxlor 51, 19, 19
	Add_state 0
	xxlor 48, 0, 0
	xxlor 49, 1, 1
	xxlor 50, 2, 2
	xxlor 51, 3, 3
	Write_256 0
	addi 14, 14, 256		# offset += 256
	addi 15, 15, -256		# len -= 256

	xxlor 5, 32+31, 32+31
	xxlor 32+31, 31, 31
	vadduwm 28, 28, 31
	xxlor 32+31, 5, 5
	TP_4x 16+0, 16+1, 16+2, 16+3
	TP_4x 16+4, 16+5, 16+6, 16+7
	TP_4x 16+8, 16+9, 16+10, 16+11
	TP_4x 16+12, 16+13, 16+14, 16+15

	xxlor 32, 16, 16
	xxlor 33, 17, 17
	xxlor 34, 18, 18
	xxlor 35, 19, 19
	Add_state 16
	Write_256 16
	addi 14, 14, 256		# offset += 256
	addi 15, 15, -256		# len -= 256

	# advance the block-counter offsets by 8 for the next 8-block batch
	xxlor 32+24, 24, 24
	xxlor 32+25, 25, 25
	xxlor 32+30, 30, 30
	vadduwm 30, 30, 25
	vadduwm 31, 30, 24
	xxlor 30, 32+30, 32+30
	xxlor 31, 32+31, 32+31

	cmpdi 15, 0
	beq Out_loop

	cmpdi 15, 512
	blt Loop_last

	mtctr 8
	b Loop_8x

Loop_last:
	lxvw4x 48, 0, 3			# vr16, constants
	lxvw4x 49, 17, 3		# vr17, key 1
	lxvw4x 50, 18, 3		# vr18, key 2
	lxvw4x 51, 19, 3		# vr19, counter, nonce

	vspltisw 21, 12
	vspltisw 23, 7
	addis 11, 2, permx@toc@ha
	addi 11, 11, permx@toc@l
	lxvw4x 32+20, 0, 11
	lxvw4x 32+22, 17, 11

	sradi 8, 7, 1			# r8 = nrounds / 2 (double rounds)
	mtctr 8

Loop_4x:
	vspltw 0, 16, 0
	vspltw 1, 16, 1
	vspltw 2, 16, 2
	vspltw 3, 16, 3

	vspltw 4, 17, 0
	vspltw 5, 17, 1
	vspltw 6, 17, 2
	vspltw 7, 17, 3
	vspltw 8, 18, 0
	vspltw 9, 18, 1
	vspltw 10, 18, 2
	vspltw 11, 18, 3
	vspltw 12, 19, 0
	vadduwm 12, 12, 30		# increase counter
	vspltw 13, 19, 1
	vspltw 14, 19, 2
	vspltw 15, 19, 3

.align 5
quarter_loop:
	QT_loop_4x

	bdnz quarter_loop

	vadduwm 12, 12, 30
	TP_4x 0, 1, 2, 3
	TP_4x 4, 5, 6, 7
	TP_4x 8, 9, 10, 11
	TP_4x 12, 13, 14, 15

	Add_state 0
	Write_256 0
	addi 14, 14, 256		# offset += 256
	addi 15, 15, -256		# len -= 256

	# Update state counter
	vspltisw 25, 4
	vadduwm 30, 30, 25

	cmpdi 15, 0
	beq Out_loop
	cmpdi 15, 256
	blt Out_loop

	mtctr 8
	b Loop_4x

Out_loop:
	RESTORE_REGS
	blr

Out_no_chacha:
	li 3, 0
	blr
SYM_FUNC_END(chacha_p10le_8x)

# permx: byte-permute patterns used with vpermxor to implement the <<< 16
# and <<< 8 word rotations of the quarter round.
SYM_DATA_START_LOCAL(PERMX)
.align 5
permx:
.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
SYM_DATA_END(PERMX)