1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | # |
3 | # Accelerated chacha20 implementation for ppc64le. |
4 | # |
5 | # Copyright 2023- IBM Corp. All rights reserved |
6 | # |
7 | #=================================================================================== |
8 | # Written by Danny Tsen <dtsen@us.ibm.com> |
9 | # |
10 | # chacha_p10le_8x(u32 *state, byte *dst, const byte *src, |
11 | # size_t len, int nrounds); |
12 | # |
# Each double round performs 8 quarter rounds (4 column + 4 diagonal); a single quarter round is:
14 | # 1. a += b; d ^= a; d <<<= 16; |
15 | # 2. c += d; b ^= c; b <<<= 12; |
16 | # 3. a += b; d ^= a; d <<<= 8; |
17 | # 4. c += d; b ^= c; b <<<= 7 |
18 | # |
19 | # row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16 |
20 | # row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12 |
21 | # row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8 |
22 | # row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7 |
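#
# A reference C sketch of that quarter round (illustrative only, not part of
# this file; rol32() is assumed to be a 32-bit rotate-left helper):
#
#	static void chacha_qr(u32 *a, u32 *b, u32 *c, u32 *d)
#	{
#		*a += *b; *d ^= *a; *d = rol32(*d, 16);
#		*c += *d; *b ^= *c; *b = rol32(*b, 12);
#		*a += *b; *d ^= *a; *d = rol32(*d, 8);
#		*c += *d; *b ^= *c; *b = rol32(*b, 7);
#	}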
23 | # |
# 4 blocks (a, b, c, d) are processed in parallel; vector register i holds
# word i of each of the four blocks:
25 | # |
26 | # a0 b0 c0 d0 |
27 | # a1 b1 c1 d1 |
28 | # ... |
29 | # a4 b4 c4 d4 |
30 | # ... |
31 | # a8 b8 c8 d8 |
32 | # ... |
33 | # a12 b12 c12 d12 |
34 | # a13 ... |
35 | # a14 ... |
36 | # a15 b15 c15 d15 |
37 | # |
38 | # Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) |
# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
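#
# In reference C terms (illustrative only, using the chacha_qr() sketch
# above on the 16-word state x[]), one double round is:
#
#	/* column round */
#	chacha_qr(&x[0], &x[4], &x[8],  &x[12]);
#	chacha_qr(&x[1], &x[5], &x[9],  &x[13]);
#	chacha_qr(&x[2], &x[6], &x[10], &x[14]);
#	chacha_qr(&x[3], &x[7], &x[11], &x[15]);
#	/* diagonal round */
#	chacha_qr(&x[0], &x[5], &x[10], &x[15]);
#	chacha_qr(&x[1], &x[6], &x[11], &x[12]);
#	chacha_qr(&x[2], &x[7], &x[8],  &x[13]);
#	chacha_qr(&x[3], &x[4], &x[9],  &x[14]);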
40 | # |
41 | |
42 | #include <asm/ppc_asm.h> |
43 | #include <asm/asm-offsets.h> |
44 | #include <asm/asm-compat.h> |
45 | #include <linux/linkage.h> |
46 | |
47 | .machine "any" |
48 | .text |
49 | |
50 | .macro SAVE_GPR GPR OFFSET FRAME |
51 | std \GPR,\OFFSET(\FRAME) |
52 | .endm |
53 | |
54 | .macro SAVE_VRS VRS OFFSET FRAME |
55 | li 16, \OFFSET |
56 | stvx \VRS, 16, \FRAME |
57 | .endm |
58 | |
59 | .macro SAVE_VSX VSX OFFSET FRAME |
60 | li 16, \OFFSET |
61 | stxvx \VSX, 16, \FRAME |
62 | .endm |
63 | |
64 | .macro RESTORE_GPR GPR OFFSET FRAME |
65 | ld \GPR,\OFFSET(\FRAME) |
66 | .endm |
67 | |
68 | .macro RESTORE_VRS VRS OFFSET FRAME |
69 | li 16, \OFFSET |
70 | lvx \VRS, 16, \FRAME |
71 | .endm |
72 | |
73 | .macro RESTORE_VSX VSX OFFSET FRAME |
74 | li 16, \OFFSET |
75 | lxvx \VSX, 16, \FRAME |
76 | .endm |
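
#
# Stack frame layout used by SAVE_REGS/RESTORE_REGS (752-byte frame;
# offsets are relative to the new r1 after the stdu):
#   16(r1)       saved LR (stored in the caller's frame before the stdu)
#   112-248(r1)  non-volatile GPRs r14-r31
#   256(r1)      vector save area: v20-v31 at +0..+176, vs14-vs31 at +192..+464
#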
77 | |
78 | .macro SAVE_REGS |
79 | mflr 0 |
80 | std 0, 16(1) |
81 | stdu 1,-752(1) |
82 | |
83 | SAVE_GPR 14, 112, 1 |
84 | SAVE_GPR 15, 120, 1 |
85 | SAVE_GPR 16, 128, 1 |
86 | SAVE_GPR 17, 136, 1 |
87 | SAVE_GPR 18, 144, 1 |
88 | SAVE_GPR 19, 152, 1 |
89 | SAVE_GPR 20, 160, 1 |
90 | SAVE_GPR 21, 168, 1 |
91 | SAVE_GPR 22, 176, 1 |
92 | SAVE_GPR 23, 184, 1 |
93 | SAVE_GPR 24, 192, 1 |
94 | SAVE_GPR 25, 200, 1 |
95 | SAVE_GPR 26, 208, 1 |
96 | SAVE_GPR 27, 216, 1 |
97 | SAVE_GPR 28, 224, 1 |
98 | SAVE_GPR 29, 232, 1 |
99 | SAVE_GPR 30, 240, 1 |
100 | SAVE_GPR 31, 248, 1 |
101 | |
102 | addi 9, 1, 256 |
103 | SAVE_VRS 20, 0, 9 |
104 | SAVE_VRS 21, 16, 9 |
105 | SAVE_VRS 22, 32, 9 |
106 | SAVE_VRS 23, 48, 9 |
107 | SAVE_VRS 24, 64, 9 |
108 | SAVE_VRS 25, 80, 9 |
109 | SAVE_VRS 26, 96, 9 |
110 | SAVE_VRS 27, 112, 9 |
111 | SAVE_VRS 28, 128, 9 |
112 | SAVE_VRS 29, 144, 9 |
113 | SAVE_VRS 30, 160, 9 |
114 | SAVE_VRS 31, 176, 9 |
115 | |
116 | SAVE_VSX 14, 192, 9 |
117 | SAVE_VSX 15, 208, 9 |
118 | SAVE_VSX 16, 224, 9 |
119 | SAVE_VSX 17, 240, 9 |
120 | SAVE_VSX 18, 256, 9 |
121 | SAVE_VSX 19, 272, 9 |
122 | SAVE_VSX 20, 288, 9 |
123 | SAVE_VSX 21, 304, 9 |
124 | SAVE_VSX 22, 320, 9 |
125 | SAVE_VSX 23, 336, 9 |
126 | SAVE_VSX 24, 352, 9 |
127 | SAVE_VSX 25, 368, 9 |
128 | SAVE_VSX 26, 384, 9 |
129 | SAVE_VSX 27, 400, 9 |
130 | SAVE_VSX 28, 416, 9 |
131 | SAVE_VSX 29, 432, 9 |
132 | SAVE_VSX 30, 448, 9 |
133 | SAVE_VSX 31, 464, 9 |
134 | .endm # SAVE_REGS |
135 | |
136 | .macro RESTORE_REGS |
137 | addi 9, 1, 256 |
138 | RESTORE_VRS 20, 0, 9 |
139 | RESTORE_VRS 21, 16, 9 |
140 | RESTORE_VRS 22, 32, 9 |
141 | RESTORE_VRS 23, 48, 9 |
142 | RESTORE_VRS 24, 64, 9 |
143 | RESTORE_VRS 25, 80, 9 |
144 | RESTORE_VRS 26, 96, 9 |
145 | RESTORE_VRS 27, 112, 9 |
146 | RESTORE_VRS 28, 128, 9 |
147 | RESTORE_VRS 29, 144, 9 |
148 | RESTORE_VRS 30, 160, 9 |
149 | RESTORE_VRS 31, 176, 9 |
150 | |
151 | RESTORE_VSX 14, 192, 9 |
152 | RESTORE_VSX 15, 208, 9 |
153 | RESTORE_VSX 16, 224, 9 |
154 | RESTORE_VSX 17, 240, 9 |
155 | RESTORE_VSX 18, 256, 9 |
156 | RESTORE_VSX 19, 272, 9 |
157 | RESTORE_VSX 20, 288, 9 |
158 | RESTORE_VSX 21, 304, 9 |
159 | RESTORE_VSX 22, 320, 9 |
160 | RESTORE_VSX 23, 336, 9 |
161 | RESTORE_VSX 24, 352, 9 |
162 | RESTORE_VSX 25, 368, 9 |
163 | RESTORE_VSX 26, 384, 9 |
164 | RESTORE_VSX 27, 400, 9 |
165 | RESTORE_VSX 28, 416, 9 |
166 | RESTORE_VSX 29, 432, 9 |
167 | RESTORE_VSX 30, 448, 9 |
168 | RESTORE_VSX 31, 464, 9 |
169 | |
170 | RESTORE_GPR 14, 112, 1 |
171 | RESTORE_GPR 15, 120, 1 |
172 | RESTORE_GPR 16, 128, 1 |
173 | RESTORE_GPR 17, 136, 1 |
174 | RESTORE_GPR 18, 144, 1 |
175 | RESTORE_GPR 19, 152, 1 |
176 | RESTORE_GPR 20, 160, 1 |
177 | RESTORE_GPR 21, 168, 1 |
178 | RESTORE_GPR 22, 176, 1 |
179 | RESTORE_GPR 23, 184, 1 |
180 | RESTORE_GPR 24, 192, 1 |
181 | RESTORE_GPR 25, 200, 1 |
182 | RESTORE_GPR 26, 208, 1 |
183 | RESTORE_GPR 27, 216, 1 |
184 | RESTORE_GPR 28, 224, 1 |
185 | RESTORE_GPR 29, 232, 1 |
186 | RESTORE_GPR 30, 240, 1 |
187 | RESTORE_GPR 31, 248, 1 |
188 | |
189 | addi 1, 1, 752 |
190 | ld 0, 16(1) |
191 | mtlr 0 |
192 | .endm # RESTORE_REGS |
193 | |
194 | .macro QT_loop_8x |
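# 8-block double round: the quarter rounds are interleaved across v0-v15
# (blocks 0-3) and v16-v31 (blocks 4-7).  All 32 VRs hold working state,
# so v25 (and later v28) is spilled to a scratch VSR and temporarily
# reloaded with the permute/rotate constants stashed in vs20-vs23.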
195 | # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) |
196 | xxlor 0, 32+25, 32+25 |
197 | xxlor 32+25, 20, 20 |
198 | vadduwm 0, 0, 4 |
199 | vadduwm 1, 1, 5 |
200 | vadduwm 2, 2, 6 |
201 | vadduwm 3, 3, 7 |
202 | vadduwm 16, 16, 20 |
203 | vadduwm 17, 17, 21 |
204 | vadduwm 18, 18, 22 |
205 | vadduwm 19, 19, 23 |
206 | |
207 | vpermxor 12, 12, 0, 25 |
208 | vpermxor 13, 13, 1, 25 |
209 | vpermxor 14, 14, 2, 25 |
210 | vpermxor 15, 15, 3, 25 |
211 | vpermxor 28, 28, 16, 25 |
212 | vpermxor 29, 29, 17, 25 |
213 | vpermxor 30, 30, 18, 25 |
214 | vpermxor 31, 31, 19, 25 |
215 | xxlor 32+25, 0, 0 |
216 | vadduwm 8, 8, 12 |
217 | vadduwm 9, 9, 13 |
218 | vadduwm 10, 10, 14 |
219 | vadduwm 11, 11, 15 |
220 | vadduwm 24, 24, 28 |
221 | vadduwm 25, 25, 29 |
222 | vadduwm 26, 26, 30 |
223 | vadduwm 27, 27, 31 |
224 | vxor 4, 4, 8 |
225 | vxor 5, 5, 9 |
226 | vxor 6, 6, 10 |
227 | vxor 7, 7, 11 |
228 | vxor 20, 20, 24 |
229 | vxor 21, 21, 25 |
230 | vxor 22, 22, 26 |
231 | vxor 23, 23, 27 |
232 | |
233 | xxlor 0, 32+25, 32+25 |
234 | xxlor 32+25, 21, 21 |
235 | vrlw 4, 4, 25 # |
236 | vrlw 5, 5, 25 |
237 | vrlw 6, 6, 25 |
238 | vrlw 7, 7, 25 |
239 | vrlw 20, 20, 25 # |
240 | vrlw 21, 21, 25 |
241 | vrlw 22, 22, 25 |
242 | vrlw 23, 23, 25 |
243 | xxlor 32+25, 0, 0 |
244 | vadduwm 0, 0, 4 |
245 | vadduwm 1, 1, 5 |
246 | vadduwm 2, 2, 6 |
247 | vadduwm 3, 3, 7 |
248 | vadduwm 16, 16, 20 |
249 | vadduwm 17, 17, 21 |
250 | vadduwm 18, 18, 22 |
251 | vadduwm 19, 19, 23 |
252 | |
253 | xxlor 0, 32+25, 32+25 |
254 | xxlor 32+25, 22, 22 |
255 | vpermxor 12, 12, 0, 25 |
256 | vpermxor 13, 13, 1, 25 |
257 | vpermxor 14, 14, 2, 25 |
258 | vpermxor 15, 15, 3, 25 |
259 | vpermxor 28, 28, 16, 25 |
260 | vpermxor 29, 29, 17, 25 |
261 | vpermxor 30, 30, 18, 25 |
262 | vpermxor 31, 31, 19, 25 |
263 | xxlor 32+25, 0, 0 |
264 | vadduwm 8, 8, 12 |
265 | vadduwm 9, 9, 13 |
266 | vadduwm 10, 10, 14 |
267 | vadduwm 11, 11, 15 |
268 | vadduwm 24, 24, 28 |
269 | vadduwm 25, 25, 29 |
270 | vadduwm 26, 26, 30 |
271 | vadduwm 27, 27, 31 |
272 | xxlor 0, 32+28, 32+28 |
273 | xxlor 32+28, 23, 23 |
274 | vxor 4, 4, 8 |
275 | vxor 5, 5, 9 |
276 | vxor 6, 6, 10 |
277 | vxor 7, 7, 11 |
278 | vxor 20, 20, 24 |
279 | vxor 21, 21, 25 |
280 | vxor 22, 22, 26 |
281 | vxor 23, 23, 27 |
282 | vrlw 4, 4, 28 # |
283 | vrlw 5, 5, 28 |
284 | vrlw 6, 6, 28 |
285 | vrlw 7, 7, 28 |
286 | vrlw 20, 20, 28 # |
287 | vrlw 21, 21, 28 |
288 | vrlw 22, 22, 28 |
289 | vrlw 23, 23, 28 |
290 | xxlor 32+28, 0, 0 |
291 | |
292 | # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) |
293 | xxlor 0, 32+25, 32+25 |
294 | xxlor 32+25, 20, 20 |
295 | vadduwm 0, 0, 5 |
296 | vadduwm 1, 1, 6 |
297 | vadduwm 2, 2, 7 |
298 | vadduwm 3, 3, 4 |
299 | vadduwm 16, 16, 21 |
300 | vadduwm 17, 17, 22 |
301 | vadduwm 18, 18, 23 |
302 | vadduwm 19, 19, 20 |
303 | |
304 | vpermxor 15, 15, 0, 25 |
305 | vpermxor 12, 12, 1, 25 |
306 | vpermxor 13, 13, 2, 25 |
307 | vpermxor 14, 14, 3, 25 |
308 | vpermxor 31, 31, 16, 25 |
309 | vpermxor 28, 28, 17, 25 |
310 | vpermxor 29, 29, 18, 25 |
311 | vpermxor 30, 30, 19, 25 |
312 | |
313 | xxlor 32+25, 0, 0 |
314 | vadduwm 10, 10, 15 |
315 | vadduwm 11, 11, 12 |
316 | vadduwm 8, 8, 13 |
317 | vadduwm 9, 9, 14 |
318 | vadduwm 26, 26, 31 |
319 | vadduwm 27, 27, 28 |
320 | vadduwm 24, 24, 29 |
321 | vadduwm 25, 25, 30 |
322 | vxor 5, 5, 10 |
323 | vxor 6, 6, 11 |
324 | vxor 7, 7, 8 |
325 | vxor 4, 4, 9 |
326 | vxor 21, 21, 26 |
327 | vxor 22, 22, 27 |
328 | vxor 23, 23, 24 |
329 | vxor 20, 20, 25 |
330 | |
331 | xxlor 0, 32+25, 32+25 |
332 | xxlor 32+25, 21, 21 |
333 | vrlw 5, 5, 25 |
334 | vrlw 6, 6, 25 |
335 | vrlw 7, 7, 25 |
336 | vrlw 4, 4, 25 |
337 | vrlw 21, 21, 25 |
338 | vrlw 22, 22, 25 |
339 | vrlw 23, 23, 25 |
340 | vrlw 20, 20, 25 |
341 | xxlor 32+25, 0, 0 |
342 | |
343 | vadduwm 0, 0, 5 |
344 | vadduwm 1, 1, 6 |
345 | vadduwm 2, 2, 7 |
346 | vadduwm 3, 3, 4 |
347 | vadduwm 16, 16, 21 |
348 | vadduwm 17, 17, 22 |
349 | vadduwm 18, 18, 23 |
350 | vadduwm 19, 19, 20 |
351 | |
352 | xxlor 0, 32+25, 32+25 |
353 | xxlor 32+25, 22, 22 |
354 | vpermxor 15, 15, 0, 25 |
355 | vpermxor 12, 12, 1, 25 |
356 | vpermxor 13, 13, 2, 25 |
357 | vpermxor 14, 14, 3, 25 |
358 | vpermxor 31, 31, 16, 25 |
359 | vpermxor 28, 28, 17, 25 |
360 | vpermxor 29, 29, 18, 25 |
361 | vpermxor 30, 30, 19, 25 |
362 | xxlor 32+25, 0, 0 |
363 | |
364 | vadduwm 10, 10, 15 |
365 | vadduwm 11, 11, 12 |
366 | vadduwm 8, 8, 13 |
367 | vadduwm 9, 9, 14 |
368 | vadduwm 26, 26, 31 |
369 | vadduwm 27, 27, 28 |
370 | vadduwm 24, 24, 29 |
371 | vadduwm 25, 25, 30 |
372 | |
373 | xxlor 0, 32+28, 32+28 |
374 | xxlor 32+28, 23, 23 |
375 | vxor 5, 5, 10 |
376 | vxor 6, 6, 11 |
377 | vxor 7, 7, 8 |
378 | vxor 4, 4, 9 |
379 | vxor 21, 21, 26 |
380 | vxor 22, 22, 27 |
381 | vxor 23, 23, 24 |
382 | vxor 20, 20, 25 |
383 | vrlw 5, 5, 28 |
384 | vrlw 6, 6, 28 |
385 | vrlw 7, 7, 28 |
386 | vrlw 4, 4, 28 |
387 | vrlw 21, 21, 28 |
388 | vrlw 22, 22, 28 |
389 | vrlw 23, 23, 28 |
390 | vrlw 20, 20, 28 |
391 | xxlor 32+28, 0, 0 |
392 | .endm |
393 | |
394 | .macro QT_loop_4x |
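# 4-block variant of the double round: only v0-v15 hold working state, so
# the permute/rotate constants stay resident in v20-v23.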
395 | # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) |
396 | vadduwm 0, 0, 4 |
397 | vadduwm 1, 1, 5 |
398 | vadduwm 2, 2, 6 |
399 | vadduwm 3, 3, 7 |
400 | vpermxor 12, 12, 0, 20 |
401 | vpermxor 13, 13, 1, 20 |
402 | vpermxor 14, 14, 2, 20 |
403 | vpermxor 15, 15, 3, 20 |
404 | vadduwm 8, 8, 12 |
405 | vadduwm 9, 9, 13 |
406 | vadduwm 10, 10, 14 |
407 | vadduwm 11, 11, 15 |
408 | vxor 4, 4, 8 |
409 | vxor 5, 5, 9 |
410 | vxor 6, 6, 10 |
411 | vxor 7, 7, 11 |
412 | vrlw 4, 4, 21 |
413 | vrlw 5, 5, 21 |
414 | vrlw 6, 6, 21 |
415 | vrlw 7, 7, 21 |
416 | vadduwm 0, 0, 4 |
417 | vadduwm 1, 1, 5 |
418 | vadduwm 2, 2, 6 |
419 | vadduwm 3, 3, 7 |
420 | vpermxor 12, 12, 0, 22 |
421 | vpermxor 13, 13, 1, 22 |
422 | vpermxor 14, 14, 2, 22 |
423 | vpermxor 15, 15, 3, 22 |
424 | vadduwm 8, 8, 12 |
425 | vadduwm 9, 9, 13 |
426 | vadduwm 10, 10, 14 |
427 | vadduwm 11, 11, 15 |
428 | vxor 4, 4, 8 |
429 | vxor 5, 5, 9 |
430 | vxor 6, 6, 10 |
431 | vxor 7, 7, 11 |
432 | vrlw 4, 4, 23 |
433 | vrlw 5, 5, 23 |
434 | vrlw 6, 6, 23 |
435 | vrlw 7, 7, 23 |
436 | |
437 | # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) |
438 | vadduwm 0, 0, 5 |
439 | vadduwm 1, 1, 6 |
440 | vadduwm 2, 2, 7 |
441 | vadduwm 3, 3, 4 |
442 | vpermxor 15, 15, 0, 20 |
443 | vpermxor 12, 12, 1, 20 |
444 | vpermxor 13, 13, 2, 20 |
445 | vpermxor 14, 14, 3, 20 |
446 | vadduwm 10, 10, 15 |
447 | vadduwm 11, 11, 12 |
448 | vadduwm 8, 8, 13 |
449 | vadduwm 9, 9, 14 |
450 | vxor 5, 5, 10 |
451 | vxor 6, 6, 11 |
452 | vxor 7, 7, 8 |
453 | vxor 4, 4, 9 |
454 | vrlw 5, 5, 21 |
455 | vrlw 6, 6, 21 |
456 | vrlw 7, 7, 21 |
457 | vrlw 4, 4, 21 |
458 | vadduwm 0, 0, 5 |
459 | vadduwm 1, 1, 6 |
460 | vadduwm 2, 2, 7 |
461 | vadduwm 3, 3, 4 |
462 | vpermxor 15, 15, 0, 22 |
463 | vpermxor 12, 12, 1, 22 |
464 | vpermxor 13, 13, 2, 22 |
465 | vpermxor 14, 14, 3, 22 |
466 | vadduwm 10, 10, 15 |
467 | vadduwm 11, 11, 12 |
468 | vadduwm 8, 8, 13 |
469 | vadduwm 9, 9, 14 |
470 | vxor 5, 5, 10 |
471 | vxor 6, 6, 11 |
472 | vxor 7, 7, 8 |
473 | vxor 4, 4, 9 |
474 | vrlw 5, 5, 23 |
475 | vrlw 6, 6, 23 |
476 | vrlw 7, 7, 23 |
477 | vrlw 4, 4, 23 |
478 | .endm |
479 | |
# Transpose a 4x4 matrix of 32-bit words held across four vector registers
481 | .macro TP_4x a0 a1 a2 a3 |
482 | xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1 |
483 | xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3 |
484 | xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1 |
485 | xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3 |
486 | xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3 |
487 | xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3 |
488 | xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3 |
489 | xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3 |
490 | .endm |
491 | |
492 | # key stream = working state + state |
493 | .macro Add_state S |
494 | vadduwm \S+0, \S+0, 16-\S |
495 | vadduwm \S+4, \S+4, 17-\S |
496 | vadduwm \S+8, \S+8, 18-\S |
497 | vadduwm \S+12, \S+12, 19-\S |
498 | |
499 | vadduwm \S+1, \S+1, 16-\S |
500 | vadduwm \S+5, \S+5, 17-\S |
501 | vadduwm \S+9, \S+9, 18-\S |
502 | vadduwm \S+13, \S+13, 19-\S |
503 | |
504 | vadduwm \S+2, \S+2, 16-\S |
505 | vadduwm \S+6, \S+6, 17-\S |
506 | vadduwm \S+10, \S+10, 18-\S |
507 | vadduwm \S+14, \S+14, 19-\S |
508 | |
509 | vadduwm \S+3, \S+3, 16-\S |
510 | vadduwm \S+7, \S+7, 17-\S |
511 | vadduwm \S+11, \S+11, 18-\S |
512 | vadduwm \S+15, \S+15, 19-\S |
513 | .endm |
514 | |
515 | # |
516 | # write 256 bytes |
517 | # |
518 | .macro Write_256 S |
519 | add 9, 14, 5 |
520 | add 16, 14, 4 |
521 | lxvw4x 0, 0, 9 |
522 | lxvw4x 1, 17, 9 |
523 | lxvw4x 2, 18, 9 |
524 | lxvw4x 3, 19, 9 |
525 | lxvw4x 4, 20, 9 |
526 | lxvw4x 5, 21, 9 |
527 | lxvw4x 6, 22, 9 |
528 | lxvw4x 7, 23, 9 |
529 | lxvw4x 8, 24, 9 |
530 | lxvw4x 9, 25, 9 |
531 | lxvw4x 10, 26, 9 |
532 | lxvw4x 11, 27, 9 |
533 | lxvw4x 12, 28, 9 |
534 | lxvw4x 13, 29, 9 |
535 | lxvw4x 14, 30, 9 |
536 | lxvw4x 15, 31, 9 |
537 | |
538 | xxlxor \S+32, \S+32, 0 |
539 | xxlxor \S+36, \S+36, 1 |
540 | xxlxor \S+40, \S+40, 2 |
541 | xxlxor \S+44, \S+44, 3 |
542 | xxlxor \S+33, \S+33, 4 |
543 | xxlxor \S+37, \S+37, 5 |
544 | xxlxor \S+41, \S+41, 6 |
545 | xxlxor \S+45, \S+45, 7 |
546 | xxlxor \S+34, \S+34, 8 |
547 | xxlxor \S+38, \S+38, 9 |
548 | xxlxor \S+42, \S+42, 10 |
549 | xxlxor \S+46, \S+46, 11 |
550 | xxlxor \S+35, \S+35, 12 |
551 | xxlxor \S+39, \S+39, 13 |
552 | xxlxor \S+43, \S+43, 14 |
553 | xxlxor \S+47, \S+47, 15 |
554 | |
555 | stxvw4x \S+32, 0, 16 |
556 | stxvw4x \S+36, 17, 16 |
557 | stxvw4x \S+40, 18, 16 |
558 | stxvw4x \S+44, 19, 16 |
559 | |
560 | stxvw4x \S+33, 20, 16 |
561 | stxvw4x \S+37, 21, 16 |
562 | stxvw4x \S+41, 22, 16 |
563 | stxvw4x \S+45, 23, 16 |
564 | |
565 | stxvw4x \S+34, 24, 16 |
566 | stxvw4x \S+38, 25, 16 |
567 | stxvw4x \S+42, 26, 16 |
568 | stxvw4x \S+46, 27, 16 |
569 | |
570 | stxvw4x \S+35, 28, 16 |
571 | stxvw4x \S+39, 29, 16 |
572 | stxvw4x \S+43, 30, 16 |
573 | stxvw4x \S+47, 31, 16 |
574 | |
575 | .endm |
576 | |
577 | # |
# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
579 | # |
580 | SYM_FUNC_START(chacha_p10le_8x) |
581 | .align 5 |
582 | cmpdi 6, 0 |
583 | ble Out_no_chacha |
584 | |
585 | SAVE_REGS |
586 | |
# r17-r31 hold the constant byte offsets 16, 32, ..., 240, mainly for the Write_256 macro.
588 | li 17, 16 |
589 | li 18, 32 |
590 | li 19, 48 |
591 | li 20, 64 |
592 | li 21, 80 |
593 | li 22, 96 |
594 | li 23, 112 |
595 | li 24, 128 |
596 | li 25, 144 |
597 | li 26, 160 |
598 | li 27, 176 |
599 | li 28, 192 |
600 | li 29, 208 |
601 | li 30, 224 |
602 | li 31, 240 |
603 | |
604 | mr 15, 6 # len |
605 | li 14, 0 # offset to inp and outp |
606 | |
607 | lxvw4x 48, 0, 3 # vr16, constants |
608 | lxvw4x 49, 17, 3 # vr17, key 1 |
609 | lxvw4x 50, 18, 3 # vr18, key 2 |
610 | lxvw4x 51, 19, 3 # vr19, counter, nonce |
611 | |
612 | # create (0, 1, 2, 3) counters |
613 | vspltisw 0, 0 |
614 | vspltisw 1, 1 |
615 | vspltisw 2, 2 |
616 | vspltisw 3, 3 |
617 | vmrghw 4, 0, 1 |
618 | vmrglw 5, 2, 3 |
vsldoi 30, 4, 5, 8	# vr30 = (0, 1, 2, 3), per-lane block counter offsets
620 | |
621 | vspltisw 21, 12 |
622 | vspltisw 23, 7 |
623 | |
624 | addis 11, 2, permx@toc@ha |
625 | addi 11, 11, permx@toc@l |
626 | lxvw4x 32+20, 0, 11 |
627 | lxvw4x 32+22, 17, 11 |
628 | |
sradi 8, 7, 1		# r8 = nrounds / 2 = number of double rounds
630 | |
631 | mtctr 8 |
632 | |
# save a copy of the initial state (constants, key, counter/nonce) to vsx
634 | xxlor 16, 48, 48 |
635 | xxlor 17, 49, 49 |
636 | xxlor 18, 50, 50 |
637 | xxlor 19, 51, 51 |
638 | |
639 | vspltisw 25, 4 |
640 | vspltisw 26, 8 |
641 | |
642 | xxlor 25, 32+26, 32+26 |
643 | xxlor 24, 32+25, 32+25 |
644 | |
645 | vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4) |
646 | xxlor 30, 32+30, 32+30 |
647 | xxlor 31, 32+31, 32+31 |
648 | |
649 | xxlor 20, 32+20, 32+20 |
650 | xxlor 21, 32+21, 32+21 |
651 | xxlor 22, 32+22, 32+22 |
652 | xxlor 23, 32+23, 32+23 |
653 | |
654 | cmpdi 6, 512 |
655 | blt Loop_last |
656 | |
657 | Loop_8x: |
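# Each iteration generates 8 blocks (512 bytes) of key stream: v0-v15 hold
# four blocks and v16-v31 another four, each vector carrying one state word
# from each of its four blocks.  The VSX scratch copies set up above
# (vs16-vs19 = state, vs20-vs23 = permute/rotate constants, vs24/vs25 =
# per-lane increments of 4 and 8, vs30/vs31 = counter lane offsets) stay
# live across the loop.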
658 | xxspltw 32+0, 16, 0 |
659 | xxspltw 32+1, 16, 1 |
660 | xxspltw 32+2, 16, 2 |
661 | xxspltw 32+3, 16, 3 |
662 | |
663 | xxspltw 32+4, 17, 0 |
664 | xxspltw 32+5, 17, 1 |
665 | xxspltw 32+6, 17, 2 |
666 | xxspltw 32+7, 17, 3 |
667 | xxspltw 32+8, 18, 0 |
668 | xxspltw 32+9, 18, 1 |
669 | xxspltw 32+10, 18, 2 |
670 | xxspltw 32+11, 18, 3 |
671 | xxspltw 32+12, 19, 0 |
672 | xxspltw 32+13, 19, 1 |
673 | xxspltw 32+14, 19, 2 |
674 | xxspltw 32+15, 19, 3 |
675 | vadduwm 12, 12, 30 # increase counter |
676 | |
677 | xxspltw 32+16, 16, 0 |
678 | xxspltw 32+17, 16, 1 |
679 | xxspltw 32+18, 16, 2 |
680 | xxspltw 32+19, 16, 3 |
681 | |
682 | xxspltw 32+20, 17, 0 |
683 | xxspltw 32+21, 17, 1 |
684 | xxspltw 32+22, 17, 2 |
685 | xxspltw 32+23, 17, 3 |
686 | xxspltw 32+24, 18, 0 |
687 | xxspltw 32+25, 18, 1 |
688 | xxspltw 32+26, 18, 2 |
689 | xxspltw 32+27, 18, 3 |
690 | xxspltw 32+28, 19, 0 |
691 | xxspltw 32+29, 19, 1 |
692 | vadduwm 28, 28, 31 # increase counter |
693 | xxspltw 32+30, 19, 2 |
694 | xxspltw 32+31, 19, 3 |
695 | |
696 | .align 5 |
697 | quarter_loop_8x: |
698 | QT_loop_8x |
699 | |
700 | bdnz quarter_loop_8x |
701 | |
702 | xxlor 0, 32+30, 32+30 |
703 | xxlor 32+30, 30, 30 |
704 | vadduwm 12, 12, 30 |
705 | xxlor 32+30, 0, 0 |
706 | TP_4x 0, 1, 2, 3 |
707 | TP_4x 4, 5, 6, 7 |
708 | TP_4x 8, 9, 10, 11 |
709 | TP_4x 12, 13, 14, 15 |
710 | |
711 | xxlor 0, 48, 48 |
712 | xxlor 1, 49, 49 |
713 | xxlor 2, 50, 50 |
714 | xxlor 3, 51, 51 |
715 | xxlor 48, 16, 16 |
716 | xxlor 49, 17, 17 |
717 | xxlor 50, 18, 18 |
718 | xxlor 51, 19, 19 |
719 | Add_state 0 |
720 | xxlor 48, 0, 0 |
721 | xxlor 49, 1, 1 |
722 | xxlor 50, 2, 2 |
723 | xxlor 51, 3, 3 |
724 | Write_256 0 |
725 | addi 14, 14, 256 # offset +=256 |
726 | addi 15, 15, -256 # len -=256 |
727 | |
728 | xxlor 5, 32+31, 32+31 |
729 | xxlor 32+31, 31, 31 |
730 | vadduwm 28, 28, 31 |
731 | xxlor 32+31, 5, 5 |
732 | TP_4x 16+0, 16+1, 16+2, 16+3 |
733 | TP_4x 16+4, 16+5, 16+6, 16+7 |
734 | TP_4x 16+8, 16+9, 16+10, 16+11 |
735 | TP_4x 16+12, 16+13, 16+14, 16+15 |
736 | |
737 | xxlor 32, 16, 16 |
738 | xxlor 33, 17, 17 |
739 | xxlor 34, 18, 18 |
740 | xxlor 35, 19, 19 |
741 | Add_state 16 |
742 | Write_256 16 |
743 | addi 14, 14, 256 # offset +=256 |
addi 15, 15, -256	# len -= 256
745 | |
746 | xxlor 32+24, 24, 24 |
747 | xxlor 32+25, 25, 25 |
748 | xxlor 32+30, 30, 30 |
749 | vadduwm 30, 30, 25 |
750 | vadduwm 31, 30, 24 |
751 | xxlor 30, 32+30, 32+30 |
752 | xxlor 31, 32+31, 32+31 |
753 | |
754 | cmpdi 15, 0 |
755 | beq Out_loop |
756 | |
757 | cmpdi 15, 512 |
758 | blt Loop_last |
759 | |
760 | mtctr 8 |
761 | b Loop_8x |
762 | |
763 | Loop_last: |
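# Handle the remaining data 4 blocks (256 bytes) at a time: reload the
# state and the permute/rotate constants, then run the 4-block double-round
# loop below.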
764 | lxvw4x 48, 0, 3 # vr16, constants |
765 | lxvw4x 49, 17, 3 # vr17, key 1 |
766 | lxvw4x 50, 18, 3 # vr18, key 2 |
767 | lxvw4x 51, 19, 3 # vr19, counter, nonce |
768 | |
769 | vspltisw 21, 12 |
770 | vspltisw 23, 7 |
771 | addis 11, 2, permx@toc@ha |
772 | addi 11, 11, permx@toc@l |
773 | lxvw4x 32+20, 0, 11 |
774 | lxvw4x 32+22, 17, 11 |
775 | |
sradi 8, 7, 1		# r8 = nrounds / 2 = number of double rounds
777 | mtctr 8 |
778 | |
779 | Loop_4x: |
780 | vspltw 0, 16, 0 |
781 | vspltw 1, 16, 1 |
782 | vspltw 2, 16, 2 |
783 | vspltw 3, 16, 3 |
784 | |
785 | vspltw 4, 17, 0 |
786 | vspltw 5, 17, 1 |
787 | vspltw 6, 17, 2 |
788 | vspltw 7, 17, 3 |
789 | vspltw 8, 18, 0 |
790 | vspltw 9, 18, 1 |
791 | vspltw 10, 18, 2 |
792 | vspltw 11, 18, 3 |
793 | vspltw 12, 19, 0 |
794 | vadduwm 12, 12, 30 # increase counter |
795 | vspltw 13, 19, 1 |
796 | vspltw 14, 19, 2 |
797 | vspltw 15, 19, 3 |
798 | |
799 | .align 5 |
800 | quarter_loop: |
801 | QT_loop_4x |
802 | |
803 | bdnz quarter_loop |
804 | |
805 | vadduwm 12, 12, 30 |
806 | TP_4x 0, 1, 2, 3 |
807 | TP_4x 4, 5, 6, 7 |
808 | TP_4x 8, 9, 10, 11 |
809 | TP_4x 12, 13, 14, 15 |
810 | |
811 | Add_state 0 |
812 | Write_256 0 |
813 | addi 14, 14, 256 # offset += 256 |
addi 15, 15, -256	# len -= 256
815 | |
816 | # Update state counter |
817 | vspltisw 25, 4 |
818 | vadduwm 30, 30, 25 |
819 | |
820 | cmpdi 15, 0 |
821 | beq Out_loop |
822 | cmpdi 15, 256 |
823 | blt Out_loop |
824 | |
825 | mtctr 8 |
826 | b Loop_4x |
827 | |
828 | Out_loop: |
829 | RESTORE_REGS |
830 | blr |
831 | |
832 | Out_no_chacha: |
833 | li 3, 0 |
834 | blr |
835 | SYM_FUNC_END(chacha_p10le_8x) |
836 | |
837 | SYM_DATA_START_LOCAL(PERMX) |
838 | .align 5 |
839 | permx: |
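# vpermxor masks: the first vector folds the xor and the rotate-left-by-16
# of each 32-bit word into a single instruction; the second does the same
# for the rotate-left-by-8.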
840 | .long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd |
841 | .long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc |
842 | SYM_DATA_END(PERMX) |
843 | |