/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated poly1305 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# Poly1305 - this version mainly uses vector/VSX and scalar instructions
# - 26-bit limbs
# - handles multiple 64-byte blocks
#
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF (high:low 64-bit halves)
# p = 2^130 - 5
# a += m
# a = (a * r) % p
# a += s
#
# Performance is improved by expanding the polynomial into a sum of products:
# h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
#
# 07/22/21 - this revision is based on the above sum of products.  Set up r^4, r^3, r^2, r and s3, s2, s1, s0
# in 9 vectors for the multiplications.
#
# setup r^4, r^3, r^2, r vectors
# vs [r^1, r^3, r^2, r^4]
# vs0 = [r0,.....]
# vs1 = [r1,.....]
# vs2 = [r2,.....]
# vs3 = [r3,.....]
# vs4 = [r4,.....]
# vs5 = [r1*5,...]
# vs6 = [r2*5,...]
# vs7 = [r3*5,...]
# vs8 = [r4*5,...]
#
# Each word of a vector holds one "r/s" member of the [a * r/s] products below:
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0, r4*5, r3*5, r2*5;
# r2, r1, r0, r4*5, r3*5;
# r3, r2, r1, r0, r4*5;
# r4, r3, r2, r1, r0 ;
#
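# The arithmetic above can be cross-checked against a plain big-integer model.
# The sketch below is illustrative Python (not part of the build); it shows the
# clamp and the per-block update, and why four blocks can be absorbed at once
# with one multiply by each of r^4, r^3, r^2 and r:
#
#   p = (1 << 130) - 5
#   def poly1305_blocks(h, r, blocks):          # r already clamped:
#       for m in blocks:                        #   r &= 0x0ffffffc0ffffffc0ffffffc0fffffff
#           h = ((h + m) * r) % p               # a += m; a = (a * r) % p
#       return h
#
#   # Absorbing m1..m4 serially equals one step of the expanded form:
#   #   h4 = (h0 + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r  (mod p)
#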
#
# poly1305_p10le_4blocks(uint8_t *k, uint8_t *m, uint32_t mlen)
#  k = 32 bytes key
#  r3 = k (r, s)
#  r4 = m
#  r5 = mlen (>= 64)
#
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"

.text

.macro SAVE_GPR GPR OFFSET FRAME
        std \GPR,\OFFSET(\FRAME)
.endm

.macro SAVE_VRS VRS OFFSET FRAME
        li 16, \OFFSET
        stvx \VRS, 16, \FRAME
.endm

.macro SAVE_VSX VSX OFFSET FRAME
        li 16, \OFFSET
        stxvx \VSX, 16, \FRAME
.endm

.macro RESTORE_GPR GPR OFFSET FRAME
        ld \GPR,\OFFSET(\FRAME)
.endm

.macro RESTORE_VRS VRS OFFSET FRAME
        li 16, \OFFSET
        lvx \VRS, 16, \FRAME
.endm

.macro RESTORE_VSX VSX OFFSET FRAME
        li 16, \OFFSET
        lxvx \VSX, 16, \FRAME
.endm

.macro SAVE_REGS
        mflr 0
        std 0, 16(1)
        stdu 1,-752(1)

        SAVE_GPR 14, 112, 1
        SAVE_GPR 15, 120, 1
        SAVE_GPR 16, 128, 1
        SAVE_GPR 17, 136, 1
        SAVE_GPR 18, 144, 1
        SAVE_GPR 19, 152, 1
        SAVE_GPR 20, 160, 1
        SAVE_GPR 21, 168, 1
        SAVE_GPR 22, 176, 1
        SAVE_GPR 23, 184, 1
        SAVE_GPR 24, 192, 1
        SAVE_GPR 25, 200, 1
        SAVE_GPR 26, 208, 1
        SAVE_GPR 27, 216, 1
        SAVE_GPR 28, 224, 1
        SAVE_GPR 29, 232, 1
        SAVE_GPR 30, 240, 1
        SAVE_GPR 31, 248, 1

        addi 9, 1, 256
        SAVE_VRS 20, 0, 9
        SAVE_VRS 21, 16, 9
        SAVE_VRS 22, 32, 9
        SAVE_VRS 23, 48, 9
        SAVE_VRS 24, 64, 9
        SAVE_VRS 25, 80, 9
        SAVE_VRS 26, 96, 9
        SAVE_VRS 27, 112, 9
        SAVE_VRS 28, 128, 9
        SAVE_VRS 29, 144, 9
        SAVE_VRS 30, 160, 9
        SAVE_VRS 31, 176, 9

        SAVE_VSX 14, 192, 9
        SAVE_VSX 15, 208, 9
        SAVE_VSX 16, 224, 9
        SAVE_VSX 17, 240, 9
        SAVE_VSX 18, 256, 9
        SAVE_VSX 19, 272, 9
        SAVE_VSX 20, 288, 9
        SAVE_VSX 21, 304, 9
        SAVE_VSX 22, 320, 9
        SAVE_VSX 23, 336, 9
        SAVE_VSX 24, 352, 9
        SAVE_VSX 25, 368, 9
        SAVE_VSX 26, 384, 9
        SAVE_VSX 27, 400, 9
        SAVE_VSX 28, 416, 9
        SAVE_VSX 29, 432, 9
        SAVE_VSX 30, 448, 9
        SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
        addi 9, 1, 256
        RESTORE_VRS 20, 0, 9
        RESTORE_VRS 21, 16, 9
        RESTORE_VRS 22, 32, 9
        RESTORE_VRS 23, 48, 9
        RESTORE_VRS 24, 64, 9
        RESTORE_VRS 25, 80, 9
        RESTORE_VRS 26, 96, 9
        RESTORE_VRS 27, 112, 9
        RESTORE_VRS 28, 128, 9
        RESTORE_VRS 29, 144, 9
        RESTORE_VRS 30, 160, 9
        RESTORE_VRS 31, 176, 9

        RESTORE_VSX 14, 192, 9
        RESTORE_VSX 15, 208, 9
        RESTORE_VSX 16, 224, 9
        RESTORE_VSX 17, 240, 9
        RESTORE_VSX 18, 256, 9
        RESTORE_VSX 19, 272, 9
        RESTORE_VSX 20, 288, 9
        RESTORE_VSX 21, 304, 9
        RESTORE_VSX 22, 320, 9
        RESTORE_VSX 23, 336, 9
        RESTORE_VSX 24, 352, 9
        RESTORE_VSX 25, 368, 9
        RESTORE_VSX 26, 384, 9
        RESTORE_VSX 27, 400, 9
        RESTORE_VSX 28, 416, 9
        RESTORE_VSX 29, 432, 9
        RESTORE_VSX 30, 448, 9
        RESTORE_VSX 31, 464, 9

        RESTORE_GPR 14, 112, 1
        RESTORE_GPR 15, 120, 1
        RESTORE_GPR 16, 128, 1
        RESTORE_GPR 17, 136, 1
        RESTORE_GPR 18, 144, 1
        RESTORE_GPR 19, 152, 1
        RESTORE_GPR 20, 160, 1
        RESTORE_GPR 21, 168, 1
        RESTORE_GPR 22, 176, 1
        RESTORE_GPR 23, 184, 1
        RESTORE_GPR 24, 192, 1
        RESTORE_GPR 25, 200, 1
        RESTORE_GPR 26, 208, 1
        RESTORE_GPR 27, 216, 1
        RESTORE_GPR 28, 224, 1
        RESTORE_GPR 29, 232, 1
        RESTORE_GPR 30, 240, 1
        RESTORE_GPR 31, 248, 1

        addi 1, 1, 752
        ld 0, 16(1)
        mtlr 0
.endm # RESTORE_REGS

#
# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5;
# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5;
# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5;
# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ;
#
# [r^2, r^3, r^1, r^4]
# [m3, m2, m4, m1]
#
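# A scalar model of the five products above (illustrative Python). Each a[i]
# and r[i] is a 26-bit limb, so every partial product and each five-term sum
# fits in a 64-bit doubleword, which is what vmulouw/vmuleuw and vaddudm
# accumulate below:
#
#   MASK26 = (1 << 26) - 1
#   def mul_limbs(a, r):                     # a, r: five 26-bit limbs each
#       s = [x * 5 for x in r]               # wrap-around terms: 2^130 == 5 (mod p)
#       return [
#           a[0]*r[0] + a[1]*s[4] + a[2]*s[3] + a[3]*s[2] + a[4]*s[1],
#           a[0]*r[1] + a[1]*r[0] + a[2]*s[4] + a[3]*s[3] + a[4]*s[2],
#           a[0]*r[2] + a[1]*r[1] + a[2]*r[0] + a[3]*s[4] + a[4]*s[3],
#           a[0]*r[3] + a[1]*r[2] + a[2]*r[1] + a[3]*r[0] + a[4]*s[4],
#           a[0]*r[4] + a[1]*r[3] + a[2]*r[2] + a[3]*r[1] + a[4]*r[0],
#       ]                                    # unreduced; see the carry step below
#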
# multiply odd and even words
.macro mul_odd
        vmulouw 14, 4, 26
        vmulouw 10, 5, 3
        vmulouw 11, 6, 2
        vmulouw 12, 7, 1
        vmulouw 13, 8, 0
        vmulouw 15, 4, 27
        vaddudm 14, 14, 10
        vaddudm 14, 14, 11
        vmulouw 10, 5, 26
        vmulouw 11, 6, 3
        vaddudm 14, 14, 12
        vaddudm 14, 14, 13 # x0
        vaddudm 15, 15, 10
        vaddudm 15, 15, 11
        vmulouw 12, 7, 2
        vmulouw 13, 8, 1
        vaddudm 15, 15, 12
        vaddudm 15, 15, 13 # x1
        vmulouw 16, 4, 28
        vmulouw 10, 5, 27
        vmulouw 11, 6, 26
        vaddudm 16, 16, 10
        vaddudm 16, 16, 11
        vmulouw 12, 7, 3
        vmulouw 13, 8, 2
        vaddudm 16, 16, 12
        vaddudm 16, 16, 13 # x2
        vmulouw 17, 4, 29
        vmulouw 10, 5, 28
        vmulouw 11, 6, 27
        vaddudm 17, 17, 10
        vaddudm 17, 17, 11
        vmulouw 12, 7, 26
        vmulouw 13, 8, 3
        vaddudm 17, 17, 12
        vaddudm 17, 17, 13 # x3
        vmulouw 18, 4, 30
        vmulouw 10, 5, 29
        vmulouw 11, 6, 28
        vaddudm 18, 18, 10
        vaddudm 18, 18, 11
        vmulouw 12, 7, 27
        vmulouw 13, 8, 26
        vaddudm 18, 18, 12
        vaddudm 18, 18, 13 # x4
.endm

.macro mul_even
        vmuleuw 9, 4, 26
        vmuleuw 10, 5, 3
        vmuleuw 11, 6, 2
        vmuleuw 12, 7, 1
        vmuleuw 13, 8, 0
        vaddudm 14, 14, 9
        vaddudm 14, 14, 10
        vaddudm 14, 14, 11
        vaddudm 14, 14, 12
        vaddudm 14, 14, 13 # x0

        vmuleuw 9, 4, 27
        vmuleuw 10, 5, 26
        vmuleuw 11, 6, 3
        vmuleuw 12, 7, 2
        vmuleuw 13, 8, 1
        vaddudm 15, 15, 9
        vaddudm 15, 15, 10
        vaddudm 15, 15, 11
        vaddudm 15, 15, 12
        vaddudm 15, 15, 13 # x1

        vmuleuw 9, 4, 28
        vmuleuw 10, 5, 27
        vmuleuw 11, 6, 26
        vmuleuw 12, 7, 3
        vmuleuw 13, 8, 2
        vaddudm 16, 16, 9
        vaddudm 16, 16, 10
        vaddudm 16, 16, 11
        vaddudm 16, 16, 12
        vaddudm 16, 16, 13 # x2

        vmuleuw 9, 4, 29
        vmuleuw 10, 5, 28
        vmuleuw 11, 6, 27
        vmuleuw 12, 7, 26
        vmuleuw 13, 8, 3
        vaddudm 17, 17, 9
        vaddudm 17, 17, 10
        vaddudm 17, 17, 11
        vaddudm 17, 17, 12
        vaddudm 17, 17, 13 # x3

        vmuleuw 9, 4, 30
        vmuleuw 10, 5, 29
        vmuleuw 11, 6, 28
        vmuleuw 12, 7, 27
        vmuleuw 13, 8, 26
        vaddudm 18, 18, 9
        vaddudm 18, 18, 10
        vaddudm 18, 18, 11
        vaddudm 18, 18, 12
        vaddudm 18, 18, 13 # x4
.endm

#
# poly1305_setup_r
#
# setup r^4, r^3, r^2, r vectors
# [r, r^3, r^2, r^4]
# vs0 = [r0,...]
# vs1 = [r1,...]
# vs2 = [r2,...]
# vs3 = [r3,...]
# vs4 = [r4,...]
# vs5 = [r1*5,...]
# vs6 = [r2*5,...]
# vs7 = [r3*5,...]
# vs8 = [r4*5,...]
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0, r4*5, r3*5, r2*5;
# r2, r1, r0, r4*5, r3*5;
# r3, r2, r1, r0, r4*5;
# r4, r3, r2, r1, r0 ;
#
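# In big-integer terms (with r and p as in the first sketch near the top of
# this file) the values interleaved here are simply the limbs of the first
# four powers of r; illustrative Python:
#
#   def to_limbs(x):                         # 130-bit value -> five 26-bit limbs
#       return [(x >> (26 * i)) & ((1 << 26) - 1) for i in range(5)]
#   r2 = (r * r) % p
#   r3 = (r2 * r) % p
#   r4 = (r2 * r2) % p
#   # the vector words hold the limbs of r, r^2, r^3, r^4 (plus the *5 copies
#   # of limbs 1-4), laid out as described above
#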
.macro poly1305_setup_r

        # save r
        xxlor 26, 58, 58
        xxlor 27, 59, 59
        xxlor 28, 60, 60
        xxlor 29, 61, 61
        xxlor 30, 62, 62

        xxlxor 31, 31, 31

# [r, r^3, r^2, r^4]
        # compute r^2
        vmr 4, 26
        vmr 5, 27
        vmr 6, 28
        vmr 7, 29
        vmr 8, 30
        bl do_mul # r^2 r^1
        xxpermdi 58, 58, 36, 0x3 # r0
        xxpermdi 59, 59, 37, 0x3 # r1
        xxpermdi 60, 60, 38, 0x3 # r2
        xxpermdi 61, 61, 39, 0x3 # r3
        xxpermdi 62, 62, 40, 0x3 # r4
        xxpermdi 36, 36, 36, 0x3
        xxpermdi 37, 37, 37, 0x3
        xxpermdi 38, 38, 38, 0x3
        xxpermdi 39, 39, 39, 0x3
        xxpermdi 40, 40, 40, 0x3
        vspltisb 13, 2
        vsld 9, 27, 13
        vsld 10, 28, 13
        vsld 11, 29, 13
        vsld 12, 30, 13
        vaddudm 0, 9, 27
        vaddudm 1, 10, 28
        vaddudm 2, 11, 29
        vaddudm 3, 12, 30

        bl do_mul # r^4 r^3
        vmrgow 26, 26, 4
        vmrgow 27, 27, 5
        vmrgow 28, 28, 6
        vmrgow 29, 29, 7
        vmrgow 30, 30, 8
        vspltisb 13, 2
        vsld 9, 27, 13
        vsld 10, 28, 13
        vsld 11, 29, 13
        vsld 12, 30, 13
        vaddudm 0, 9, 27
        vaddudm 1, 10, 28
        vaddudm 2, 11, 29
        vaddudm 3, 12, 30

        # r^2 r^4
        xxlor 0, 58, 58
        xxlor 1, 59, 59
        xxlor 2, 60, 60
        xxlor 3, 61, 61
        xxlor 4, 62, 62
        xxlor 5, 32, 32
        xxlor 6, 33, 33
        xxlor 7, 34, 34
        xxlor 8, 35, 35

        vspltw 9, 26, 3
        vspltw 10, 26, 2
        vmrgow 26, 10, 9
        vspltw 9, 27, 3
        vspltw 10, 27, 2
        vmrgow 27, 10, 9
        vspltw 9, 28, 3
        vspltw 10, 28, 2
        vmrgow 28, 10, 9
        vspltw 9, 29, 3
        vspltw 10, 29, 2
        vmrgow 29, 10, 9
        vspltw 9, 30, 3
        vspltw 10, 30, 2
        vmrgow 30, 10, 9

        vsld 9, 27, 13
        vsld 10, 28, 13
        vsld 11, 29, 13
        vsld 12, 30, 13
        vaddudm 0, 9, 27
        vaddudm 1, 10, 28
        vaddudm 2, 11, 29
        vaddudm 3, 12, 30
.endm

SYM_FUNC_START_LOCAL(do_mul)
        mul_odd

        # do reduction ( h %= p )
        # carry reduction
        vspltisb 9, 2
        vsrd 10, 14, 31
        vsrd 11, 17, 31
        vand 7, 17, 25
        vand 4, 14, 25
        vaddudm 18, 18, 11
        vsrd 12, 18, 31
        vaddudm 15, 15, 10

        vsrd 11, 15, 31
        vand 8, 18, 25
        vand 5, 15, 25
        vaddudm 4, 4, 12
        vsld 10, 12, 9
        vaddudm 6, 16, 11

        vsrd 13, 6, 31
        vand 6, 6, 25
        vaddudm 4, 4, 10
        vsrd 10, 4, 31
        vaddudm 7, 7, 13

        vsrd 11, 7, 31
        vand 7, 7, 25
        vand 4, 4, 25
        vaddudm 5, 5, 10
        vaddudm 8, 8, 11
        blr
SYM_FUNC_END(do_mul)
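
#
# Scalar model of the carry chain above (illustrative Python): every unreduced
# sum keeps its low 26 bits and pushes the rest into the next limb; the carry
# out of limb 4 re-enters limb 0 multiplied by 5 (one add plus a shift-by-2 add):
#
#   def carry_reduce(x):                     # x: five unreduced sums
#       x[1] += x[0] >> 26;       x[0] &= MASK26
#       x[4] += x[3] >> 26;       x[3] &= MASK26
#       x[0] += (x[4] >> 26) * 5; x[4] &= MASK26   # 2^130 wraps to 5
#       x[2] += x[1] >> 26;       x[1] &= MASK26
#       x[3] += x[2] >> 26;       x[2] &= MASK26
#       x[1] += x[0] >> 26;       x[0] &= MASK26
#       x[4] += x[3] >> 26;       x[3] &= MASK26
#       return x                             # limbs are back to roughly 26 bits
#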

#
# init key
#
.macro do_poly1305_init
        addis 10, 2, rmask@toc@ha
        addi 10, 10, rmask@toc@l

        ld 11, 0(10)
        ld 12, 8(10)

        li 14, 16
        li 15, 32
        addis 10, 2, cnum@toc@ha
        addi 10, 10, cnum@toc@l
        lvx 25, 0, 10 # v25 - mask
        lvx 31, 14, 10 # v31 = 26 (0x1a), limb shift count
        lvx 19, 15, 10 # v19 = 1 << 24
        lxv 24, 48(10) # vs24
        lxv 25, 64(10) # vs25

        # initialize
        # load key from r3 to vectors
        ld 9, 24(3)
        ld 10, 32(3)
        and. 9, 9, 11
        and. 10, 10, 12

        # split r into 26-bit limbs
        extrdi 14, 9, 26, 38
        extrdi 15, 9, 26, 12
        extrdi 16, 9, 12, 0
        mtvsrdd 58, 0, 14
        insrdi 16, 10, 14, 38
        mtvsrdd 59, 0, 15
        extrdi 17, 10, 26, 24
        mtvsrdd 60, 0, 16
        extrdi 18, 10, 24, 0
        mtvsrdd 61, 0, 17
        mtvsrdd 62, 0, 18

        # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
        li 9, 5
        mtvsrdd 36, 0, 9
        vmulouw 0, 27, 4 # v0 = rr0
        vmulouw 1, 28, 4 # v1 = rr1
        vmulouw 2, 29, 4 # v2 = rr2
        vmulouw 3, 30, 4 # v3 = rr3
.endm
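
#
# The extrdi/insrdi sequence above is equivalent to slicing the clamped 128-bit
# r into five 26-bit limbs; illustrative Python, with r given as the two
# little-endian doublewords loaded from the key:
#
#   def split_26(lo64, hi64):
#       v = lo64 | (hi64 << 64)
#       return [(v >> (26 * i)) & ((1 << 26) - 1) for i in range(5)]
#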

#
# poly1305_p10le_4blocks(uint8_t *k, uint8_t *m, uint32_t mlen)
#  k = 32 bytes key
#  r3 = k (r, s)
#  r4 = m
#  r5 = mlen (>= 64)
#
SYM_FUNC_START(poly1305_p10le_4blocks)
.align 5
        cmpdi 5, 64
        blt Out_no_poly1305

        SAVE_REGS

        do_poly1305_init

        li 21, 0 # counter to message

        poly1305_setup_r

        # load previous H state
        # break/convert the state h into 26-bit limbs
        ld 9, 0(3)
        ld 10, 8(3)
        ld 19, 16(3)
        sldi 19, 19, 24
        mtvsrdd 41, 0, 19
        extrdi 14, 9, 26, 38
        extrdi 15, 9, 26, 12
        extrdi 16, 9, 12, 0
        mtvsrdd 36, 0, 14
        insrdi 16, 10, 14, 38
        mtvsrdd 37, 0, 15
        extrdi 17, 10, 26, 24
        mtvsrdd 38, 0, 16
        extrdi 18, 10, 24, 0
        mtvsrdd 39, 0, 17
        mtvsrdd 40, 0, 18
        vor 8, 8, 9

        # input m1 m2
        add 20, 4, 21
        xxlor 49, 24, 24
        xxlor 50, 25, 25
        lxvw4x 43, 0, 20
        addi 17, 20, 16
        lxvw4x 44, 0, 17
        vperm 14, 11, 12, 17
        vperm 15, 11, 12, 18
        vand 9, 14, 25 # a0
        vsrd 10, 14, 31 # >> 26
        vsrd 11, 10, 31 # 12 bits left
        vand 10, 10, 25 # a1
        vspltisb 13, 12
        vand 16, 15, 25
        vsld 12, 16, 13
        vor 11, 11, 12
        vand 11, 11, 25 # a2
        vspltisb 13, 14
        vsrd 12, 15, 13 # >> 14
        vsrd 13, 12, 31 # >> 26, a4
        vand 12, 12, 25 # a3

        vaddudm 20, 4, 9
        vaddudm 21, 5, 10
        vaddudm 22, 6, 11
        vaddudm 23, 7, 12
        vaddudm 24, 8, 13

        # m3 m4
        addi 17, 17, 16
        lxvw4x 43, 0, 17
        addi 17, 17, 16
        lxvw4x 44, 0, 17
        vperm 14, 11, 12, 17
        vperm 15, 11, 12, 18
        vand 9, 14, 25 # a0
        vsrd 10, 14, 31 # >> 26
        vsrd 11, 10, 31 # 12 bits left
        vand 10, 10, 25 # a1
        vspltisb 13, 12
        vand 16, 15, 25
        vsld 12, 16, 13
        vspltisb 13, 14
        vor 11, 11, 12
        vand 11, 11, 25 # a2
        vsrd 12, 15, 13 # >> 14
        vsrd 13, 12, 31 # >> 26, a4
        vand 12, 12, 25 # a3

        # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
        vmrgow 4, 9, 20
        vmrgow 5, 10, 21
        vmrgow 6, 11, 22
        vmrgow 7, 12, 23
        vmrgow 8, 13, 24
        vaddudm 8, 8, 19

        addi 5, 5, -64 # len -= 64
        addi 21, 21, 64 # offset += 64

        li 9, 64
        divdu 31, 5, 9

        cmpdi 31, 0
        ble Skip_block_loop

        mtctr 31

# h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
# Rewrite the polynomial sum of products as two interleaved lanes, each stepped
# by r^2 per pair of blocks:
# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2
# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r^4 + m3 r^2, (h0 + m2) r^4 + m4 r^2
# .... repeat
# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r   --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
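# The lane split can be checked with big integers (illustrative Python, with p
# as in the first sketch near the top of this file): finishing the odd lane
# with r^2 and the even lane with r reproduces the serial result for 4 blocks.
#
#   def serial(h0, r, ms):
#       for m in ms:
#           h0 = ((h0 + m) * r) % p
#       return h0
#
#   def two_lane(h0, r, m1, m2, m3, m4):
#       odd  = ((h0 + m1) * r * r + m3) % p  # blocks 1 and 3, carries the old h
#       even = (m2 * r * r + m4) % p         # blocks 2 and 4
#       return (odd * r * r + even * r) % p
#
#   # serial(h0, r, [m1, m2, m3, m4]) == two_lane(h0, r, m1, m2, m3, m4)
#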
loop_4blocks:

        # Multiply odd words and even words
        mul_odd
        mul_even
        # carry reduction
        vspltisb 9, 2
        vsrd 10, 14, 31
        vsrd 11, 17, 31
        vand 7, 17, 25
        vand 4, 14, 25
        vaddudm 18, 18, 11
        vsrd 12, 18, 31
        vaddudm 15, 15, 10

        vsrd 11, 15, 31
        vand 8, 18, 25
        vand 5, 15, 25
        vaddudm 4, 4, 12
        vsld 10, 12, 9
        vaddudm 6, 16, 11

        vsrd 13, 6, 31
        vand 6, 6, 25
        vaddudm 4, 4, 10
        vsrd 10, 4, 31
        vaddudm 7, 7, 13

        vsrd 11, 7, 31
        vand 7, 7, 25
        vand 4, 4, 25
        vaddudm 5, 5, 10
        vaddudm 8, 8, 11

        # input m1 m2 m3 m4
        add 20, 4, 21
        xxlor 49, 24, 24
        xxlor 50, 25, 25
        lxvw4x 43, 0, 20
        addi 17, 20, 16
        lxvw4x 44, 0, 17
        vperm 14, 11, 12, 17
        vperm 15, 11, 12, 18
        addi 17, 17, 16
        lxvw4x 43, 0, 17
        addi 17, 17, 16
        lxvw4x 44, 0, 17
        vperm 17, 11, 12, 17
        vperm 18, 11, 12, 18

        vand 20, 14, 25 # a0
        vand 9, 17, 25 # a0
        vsrd 21, 14, 31 # >> 26
        vsrd 22, 21, 31 # 12 bits left
        vsrd 10, 17, 31 # >> 26
        vsrd 11, 10, 31 # 12 bits left

        vand 21, 21, 25 # a1
        vand 10, 10, 25 # a1

        vspltisb 13, 12
        vand 16, 15, 25
        vsld 23, 16, 13
        vor 22, 22, 23
        vand 22, 22, 25 # a2
        vand 16, 18, 25
        vsld 12, 16, 13
        vor 11, 11, 12
        vand 11, 11, 25 # a2
        vspltisb 13, 14
        vsrd 23, 15, 13 # >> 14
        vsrd 24, 23, 31 # >> 26, a4
        vand 23, 23, 25 # a3
        vsrd 12, 18, 13 # >> 14
        vsrd 13, 12, 31 # >> 26, a4
        vand 12, 12, 25 # a3

        vaddudm 4, 4, 20
        vaddudm 5, 5, 21
        vaddudm 6, 6, 22
        vaddudm 7, 7, 23
        vaddudm 8, 8, 24

        # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
        vmrgow 4, 9, 4
        vmrgow 5, 10, 5
        vmrgow 6, 11, 6
        vmrgow 7, 12, 7
        vmrgow 8, 13, 8
        vaddudm 8, 8, 19

        addi 5, 5, -64 # len -= 64
        addi 21, 21, 64 # offset += 64

        bdnz loop_4blocks

Skip_block_loop:
        xxlor 58, 0, 0
        xxlor 59, 1, 1
        xxlor 60, 2, 2
        xxlor 61, 3, 3
        xxlor 62, 4, 4
        xxlor 32, 5, 5
        xxlor 33, 6, 6
        xxlor 34, 7, 7
        xxlor 35, 8, 8

        # Multiply odd words and even words
        mul_odd
        mul_even

        # Sum the products.
        xxpermdi 41, 31, 46, 0
        xxpermdi 42, 31, 47, 0
        vaddudm 4, 14, 9
        xxpermdi 36, 31, 36, 3
        vaddudm 5, 15, 10
        xxpermdi 37, 31, 37, 3
        xxpermdi 43, 31, 48, 0
        vaddudm 6, 16, 11
        xxpermdi 38, 31, 38, 3
        xxpermdi 44, 31, 49, 0
        vaddudm 7, 17, 12
        xxpermdi 39, 31, 39, 3
        xxpermdi 45, 31, 50, 0
        vaddudm 8, 18, 13
        xxpermdi 40, 31, 40, 3

        # carry reduction
        vspltisb 9, 2
        vsrd 10, 4, 31
        vsrd 11, 7, 31
        vand 7, 7, 25
        vand 4, 4, 25
        vaddudm 8, 8, 11
        vsrd 12, 8, 31
        vaddudm 5, 5, 10

        vsrd 11, 5, 31
        vand 8, 8, 25
        vand 5, 5, 25
        vaddudm 4, 4, 12
        vsld 10, 12, 9
        vaddudm 6, 6, 11

        vsrd 13, 6, 31
        vand 6, 6, 25
        vaddudm 4, 4, 10
        vsrd 10, 4, 31
        vaddudm 7, 7, 13

        vsrd 11, 7, 31
        vand 7, 7, 25
        vand 4, 4, 25
        vaddudm 5, 5, 10
        vsrd 10, 5, 31
        vand 5, 5, 25
        vaddudm 6, 6, 10
        vaddudm 8, 8, 11

        b do_final_update

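#
# do_final_update packs the five 26-bit limbs back into the 130-bit
# accumulator layout kept in the state (two doublewords plus a word for the
# top bits); in scalar terms (illustrative Python):
#
#   def pack(x):                             # five 26-bit limbs -> h
#       return x[0] | (x[1] << 26) | (x[2] << 52) | (x[3] << 78) | (x[4] << 104)
#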
do_final_update:
        # combine the 26-bit limbs
        # v4, v5, v6, v7 and v8 are 26-bit vectors
        vsld 5, 5, 31
        vor 20, 4, 5
        vspltisb 11, 12
        vsrd 12, 6, 11
        vsld 6, 6, 31
        vsld 6, 6, 31
        vor 20, 20, 6
        vspltisb 11, 14
        vsld 7, 7, 11
        vor 21, 7, 12
        mfvsrld 16, 40 # save last 2 bytes
        vsld 8, 8, 11
        vsld 8, 8, 31
        vor 21, 21, 8
        mfvsrld 17, 52
        mfvsrld 19, 53
        srdi 16, 16, 24

        std 17, 0(3)
        std 19, 8(3)
        stw 16, 16(3)

Out_loop:
        li 3, 0

        RESTORE_REGS

        blr

Out_no_poly1305:
        li 3, 0
        blr
SYM_FUNC_END(poly1305_p10le_4blocks)

#
# =======================================================================
# The following functions implement Poly1305 with 64 x 64 bit multiplication.
#
SYM_FUNC_START_LOCAL(Poly1305_init_64)
        # mask 0x0FFFFFFC0FFFFFFC
        # mask 0x0FFFFFFC0FFFFFFF
        addis 10, 2, rmask@toc@ha
        addi 10, 10, rmask@toc@l
        ld 11, 0(10)
        ld 12, 8(10)

        # initialize
        # load key from r3
        ld 9, 24(3)
        ld 10, 32(3)
        and. 9, 9, 11 # clamp mask r0
        and. 10, 10, 12 # clamp mask r1

        srdi 21, 10, 2
        add 19, 21, 10 # s1: r19 = (r1 >> 2) * 5

        # setup r and s
        li 25, 0
        mtvsrdd 32+0, 9, 19 # r0, s1
        mtvsrdd 32+1, 10, 9 # r1, r0
        mtvsrdd 32+2, 19, 25 # s1
        mtvsrdd 32+3, 9, 25 # r0

        blr
SYM_FUNC_END(Poly1305_init_64)

# Poly1305_mult
# v6 = (h0, h1), v8 = h2
# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
#
# Output: v7, v10, v11
#
SYM_FUNC_START_LOCAL(Poly1305_mult)
        #
        # d0 = h0 * r0 + h1 * s1
        vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1

        # d1 = h0 * r1 + h1 * r0 + h2 * s1
        vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0
        vmsumudm 10, 8, 2, 11 # d1 += h2 * s1

        # d2 = h2 * r0
        vmsumudm 11, 8, 3, 9 # d2 = h2 * r0
        blr
SYM_FUNC_END(Poly1305_mult)
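
#
# Scalar model of the three vmsumudm results above (illustrative Python).
# With h = h0 + h1*2^64 + h2*2^128 and r = r0 + r1*2^64, the terms that land
# at or above 2^128 are folded back using s1 = (r1 >> 2) * 5, which is exact
# because the clamp clears the low two bits of r1 and 2^130 == 5 (mod p):
#
#   def mult_64(h0, h1, h2, r0, r1, s1):
#       d0 = h0 * r0 + h1 * s1               # h1*r1*2^128 folds into d0
#       d1 = h0 * r1 + h1 * r0 + h2 * s1     # h2*r1*2^192 folds into d1
#       d2 = h2 * r0
#       return d0, d1, d2                    # h*r == d0 + d1*2^64 + d2*2^128 (mod p)
#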

#
# carry reduction
# h %= p
#
# Input: v7, v10, v11
# Output: r27, r28, r29
#
SYM_FUNC_START_LOCAL(Carry_reduction)
        mfvsrld 27, 32+7
        mfvsrld 28, 32+10
        mfvsrld 29, 32+11
        mfvsrd 20, 32+7 # h0.h
        mfvsrd 21, 32+10 # h1.h

        addc 28, 28, 20
        adde 29, 29, 21
        srdi 22, 29, 0x2
        sldi 23, 22, 0x2
        add 23, 23, 22 # (h2 >> 2) * 5
        addc 27, 27, 23 # h0
        addze 28, 28 # h1
        andi. 29, 29, 0x3 # h2
        blr
SYM_FUNC_END(Carry_reduction)
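
#
# Scalar model of the reduction above (illustrative Python): propagate the
# 64-bit carries, then fold everything at or above 2^130 back into h0 with a
# factor of 5 and keep only two bits in h2:
#
#   M64 = (1 << 64) - 1
#   def reduce_64(d0, d1, d2):
#       t  = d1 + (d0 >> 64)
#       h2 = d2 + (t >> 64)
#       h0, h1 = d0 & M64, t & M64
#       h0 += (h2 >> 2) * 5                  # 2^130 == 5 (mod p)
#       h2 &= 3
#       h1 += h0 >> 64;  h0 &= M64
#       h2 += h1 >> 64;  h1 &= M64
#       return h0, h1, h2                    # == d0 + d1*2^64 + d2*2^128 (mod p)
#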

#
# poly1305 multiplication
# h *= r, h %= p
# d0 = h0 * r0 + h1 * s1
# d1 = h0 * r1 + h1 * r0 + h2 * s1
# d2 = h2 * r0
#
#
# unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, highbit)
# - no highbit if final leftover block (highbit = 0)
#
SYM_FUNC_START(poly1305_64s)
        cmpdi 5, 0
        ble Out_no_poly1305_64

        mflr 0
        std 0, 16(1)
        stdu 1,-400(1)

        SAVE_GPR 14, 112, 1
        SAVE_GPR 15, 120, 1
        SAVE_GPR 16, 128, 1
        SAVE_GPR 17, 136, 1
        SAVE_GPR 18, 144, 1
        SAVE_GPR 19, 152, 1
        SAVE_GPR 20, 160, 1
        SAVE_GPR 21, 168, 1
        SAVE_GPR 22, 176, 1
        SAVE_GPR 23, 184, 1
        SAVE_GPR 24, 192, 1
        SAVE_GPR 25, 200, 1
        SAVE_GPR 26, 208, 1
        SAVE_GPR 27, 216, 1
        SAVE_GPR 28, 224, 1
        SAVE_GPR 29, 232, 1
        SAVE_GPR 30, 240, 1
        SAVE_GPR 31, 248, 1

        # Init poly1305
        bl Poly1305_init_64

        li 25, 0 # offset to inp and outp

        add 11, 25, 4

        # load h
        # h0, h1, h2
        ld 27, 0(3)
        ld 28, 8(3)
        lwz 29, 16(3)

        li 30, 16
        divdu 31, 5, 30

        mtctr 31

        mr 24, 6 # highbit

Loop_block_64:
        vxor 9, 9, 9

        ld 20, 0(11)
        ld 21, 8(11)
        addi 11, 11, 16

        addc 27, 27, 20
        adde 28, 28, 21
        adde 29, 29, 24

        li 22, 0
        mtvsrdd 32+6, 27, 28 # h0, h1
        mtvsrdd 32+8, 29, 22 # h2

        bl Poly1305_mult

        bl Carry_reduction

        bdnz Loop_block_64

        std 27, 0(3)
        std 28, 8(3)
        stw 29, 16(3)

        li 3, 0

        RESTORE_GPR 14, 112, 1
        RESTORE_GPR 15, 120, 1
        RESTORE_GPR 16, 128, 1
        RESTORE_GPR 17, 136, 1
        RESTORE_GPR 18, 144, 1
        RESTORE_GPR 19, 152, 1
        RESTORE_GPR 20, 160, 1
        RESTORE_GPR 21, 168, 1
        RESTORE_GPR 22, 176, 1
        RESTORE_GPR 23, 184, 1
        RESTORE_GPR 24, 192, 1
        RESTORE_GPR 25, 200, 1
        RESTORE_GPR 26, 208, 1
        RESTORE_GPR 27, 216, 1
        RESTORE_GPR 28, 224, 1
        RESTORE_GPR 29, 232, 1
        RESTORE_GPR 30, 240, 1
        RESTORE_GPR 31, 248, 1

        addi 1, 1, 400
        ld 0, 16(1)
        mtlr 0

        blr

Out_no_poly1305_64:
        li 3, 0
        blr
SYM_FUNC_END(poly1305_64s)

#
# Input: r3 = h, r4 = s, r5 = mac
# mac = h + s
#
SYM_FUNC_START(poly1305_emit_64)
        ld 10, 0(3)
        ld 11, 8(3)
        ld 12, 16(3)

        # compare with modulus
        # h + 5 + (-p)
        mr 6, 10
        mr 7, 11
        mr 8, 12
        addic. 6, 6, 5
        addze 7, 7
        addze 8, 8
        srdi 9, 8, 2 # overflow?
        cmpdi 9, 0
        beq Skip_h64
        mr 10, 6
        mr 11, 7
        mr 12, 8

Skip_h64:
        ld 6, 0(4)
        ld 7, 8(4)
        addc 10, 10, 6
        adde 11, 11, 7
        addze 12, 12

        std 10, 0(5)
        std 11, 8(5)
        blr
SYM_FUNC_END(poly1305_emit_64)
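
#
# Equivalent scalar form of the emit step (illustrative Python): adding 5 and
# testing bit 130 decides whether h >= p; the 2^130 part of the adjusted value
# is discarded by the 128-bit store, and the tag is (h mod p + s) mod 2^128:
#
#   def emit(h, s):                          # h partially reduced (h2 <= 3)
#       g = h + 5
#       if g >> 130:                         # h + 5 >= 2^130  <=>  h >= p
#           h = g
#       return (h + s) & ((1 << 128) - 1)
#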

SYM_DATA_START_LOCAL(RMASK)
.align 5
rmask:
.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
cnum:
.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
.long 0x1a, 0x00, 0x1a, 0x00
.long 0x01000000, 0x01000000, 0x01000000, 0x01000000
.long 0x00010203, 0x04050607, 0x10111213, 0x14151617
.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
SYM_DATA_END(RMASK)
