/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME,
 * then include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 */
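/*
 * A file that includes this core is expected to look roughly like the
 * sketch below (all names in the sketch are illustrative, nothing here
 * defines them): it provides the .rodata tables referenced later
 * (.byteswap_constant, .constants, .short_constants, .barrett_constants),
 * optionally defines REFLECT for a bit-reflected CRC, names the function
 * and then pulls in this file.
 *
 *	#include "my_crc_constants.h"		// the four constant tables
 *	#define REFLECT				// only for reflected CRCs
 *	#define CRC_FUNCTION_NAME __my_crc_vpmsum
 *	#include "crc32-vpmsum_core.S"
 */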

#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define MAX_SIZE 32768

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

#define off16 r25
#define off32 r26
#define off48 r27
#define off64 r28
#define off80 r29
#define off96 r30
#define off112 r31

#define const1 v24
#define const2 v25

#define byteswap v26
#define mask_32bit v27
#define mask_64bit v28
#define zeroes v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
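/*
 * Semantically, for the REFLECT (bit-reflected) configuration this routine
 * is meant to match the minimal byte-at-a-time sketch below, assuming a
 * reflected 32-bit polynomial REFLECTED_POLY supplied alongside the
 * constant tables (the name is illustrative, not defined here):
 *
 *	unsigned int crc_ref(unsigned int crc, const unsigned char *p,
 *			     unsigned long len)
 *	{
 *		while (len--) {
 *			int i;
 *
 *			crc ^= *p++;
 *			for (i = 0; i < 8; i++)
 *				crc = (crc >> 1) ^
 *				      ((crc & 1) ? REFLECTED_POLY : 0);
 *		}
 *		return crc;
 *	}
 *
 * The code below computes the same result 128 bytes at a time with vpmsumd.
 */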
FUNC_START(CRC_FUNCTION_NAME)
	std r31,-8(r1)
	std r30,-16(r1)
	std r29,-24(r1)
	std r28,-32(r1)
	std r27,-40(r1)
	std r26,-48(r1)
	std r25,-56(r1)

	li off16,16
	li off32,32
	li off48,48
	li off64,64
	li off80,80
	li off96,96
	li off112,112
	li r0,0

	/* Enough room for saving 10 non-volatile VMX registers */
	subi r6,r1,56+10*16
	subi r7,r1,56+2*16
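	/*
	 * All of the saves go into the scratch area below the stack pointer
	 * (no stack frame is created): r25-r31 at r1-56 .. r1-8, v20-v27
	 * from r6 = r1-216 upwards, and v28/v29 from r7 = r1-88 upwards.
	 */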

	stvx v20,0,r6
	stvx v21,off16,r6
	stvx v22,off32,r6
	stvx v23,off48,r6
	stvx v24,off64,r6
	stvx v25,off80,r6
	stvx v26,off96,r6
	stvx v27,off112,r6
	stvx v28,0,r7
	stvx v29,off16,r7

	mr r10,r3

	vxor zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi mask_32bit,zeroes,v0,4
	vsldoi mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	LOAD_REG_ADDR(r3, .byteswap_constant)
	lvx byteswap,0,r3
	addi r3,r3,16
#endif

	cmpdi r5,256
	blt .Lshort

	rldicr r6,r5,0,56	/* length rounded down to a multiple of 128 */

	/* Checksum in blocks of MAX_SIZE */
1:	lis r7,MAX_SIZE@h
	ori r7,r7,MAX_SIZE@l
	mr r9,r7
	cmpd r6,r7
	bgt 2f
	mr r7,r6
2:	subf r6,r7,r6

	/* Our main loop does 128 bytes at a time */
	srdi r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes and is used against 128 bytes of input
	 * data, so the table advances at 1/8th of the data rate
	 * (128 / 16 = 8).
	 */
	sldi r8,r7,4
	srdi r9,r9,3
	subf r8,r8,r9
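	/*
	 * Worked example of the three instructions above: a full 32 kB
	 * pass runs 32768 / 128 = 256 iterations and starts at offset
	 * 32768 / 8 - 256 * 16 = 0, i.e. at the top of the table; a
	 * shorter final pass of, say, 16 kB runs 128 iterations and
	 * starts at offset 4096 - 2048 = 2048, so every pass finishes
	 * on the same final constants.
	 */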

	/* We reduce our final 128 bytes in a separate step */
	addi r7,r7,-1
	mtctr r7

	LOAD_REG_ADDR(r3, .constants)

	/* Find the start of our constants */
	add r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor v0,v0,v0
	vxor v1,v1,v1
	vxor v2,v2,v2
	vxor v3,v3,v3
	vxor v4,v4,v4
	vxor v5,v5,v5
	vxor v6,v6,v6
	vxor v7,v7,v7

	lvx const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi r0,1
	beq 2f

	/* First warm up pass */
	lvx v16,0,r4
	lvx v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx v18,off32,r4
	lvx v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx v20,off64,r4
	lvx v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx v22,off96,r4
	lvx v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi r4,r4,8*16

	/* xor in initial value */
	vxor v16,v16,v8

2:	bdz .Lfirst_warm_up_done

	addi r3,r3,16
	lvx const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori r2,r2,0		/* no-op */

	VPMSUMD(v9,v17,const1)
	lvx v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi r4,r4,8*16

	bdz .Lfirst_cool_down

	/*
	 * Main loop. We modulo schedule it such that each block of data
	 * takes three iterations to complete - first iteration load,
	 * second iteration vpmsum, third iteration xor.
	 */
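	/*
	 * Schematically (illustrative pseudo-code only), each iteration
	 * handles all eight streams j = 0..7 as:
	 *
	 *	acc[j]  ^= prod[j];                  // result from two iterations ago
	 *	prod[j]  = clmul(data[j], constant); // data loaded last iteration
	 *	data[j]  = load(p + 16 * j);         // for the next iteration
	 *	p += 128;
	 */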
	.balign 16
4:	lvx const1,0,r3
	addi r3,r3,16
	ori r2,r2,0

	vxor v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori r2,r2,0

	vxor v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori r2,r2,0

	vxor v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori r2,r2,0

	vxor v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx const2,0,r3
	ori r2,r2,0

	vxor v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori r2,r2,0

	vxor v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori r2,r2,0

	vxor v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori r2,r2,0

	vxor v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi r4,r4,8*16

	bdnz 4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx const1,0,r3
	addi r3,r3,16

	vxor v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori r2,r2,0

	vxor v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori r2,r2,0

	vxor v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori r2,r2,0

	vxor v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori r2,r2,0

	vxor v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori r2,r2,0

	vxor v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori r2,r2,0

	vxor v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori r2,r2,0

	vxor v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor v0,v0,v8
	vxor v1,v1,v9
	vxor v2,v2,v10
	vxor v3,v3,v11
	vxor v4,v4,v12
	vxor v5,v5,v13
	vxor v6,v6,v14
	vxor v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96-bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi v0,v0,zeroes,4
	vsldoi v1,v1,zeroes,4
	vsldoi v2,v2,zeroes,4
	vsldoi v3,v3,zeroes,4
	vsldoi v4,v4,zeroes,4
	vsldoi v5,v5,zeroes,4
	vsldoi v6,v6,zeroes,4
	vsldoi v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx v8,0,r4
	lvx v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx v10,off32,r4
	lvx v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx v12,off64,r4
	lvx v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx v14,off96,r4
	lvx v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi r4,r4,8*16

	vxor v16,v0,v8
	vxor v17,v1,v9
	vxor v18,v2,v10
	vxor v19,v3,v11
	vxor v20,v4,v12
	vxor v21,v5,v13
	vxor v22,v6,v14
	vxor v23,v7,v15

	li r0,1
	cmpdi r6,0
	addi r6,r6,128
	bne 1b

	/* Work out how many bytes we have left */
	andi. r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic r6,r5,128
	add r3,r3,r6

	/* How many 16-byte chunks are in the tail */
	srdi r7,r5,4
	mtctr r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
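	/*
	 * Roughly speaking, each remaining word of data is carry-less
	 * multiplied by a precomputed (x^k mod P) whose exponent k is that
	 * word's bit distance from the end of the message plus the 32
	 * appended zero bits, so the partial products can simply be
	 * xor-ed together below.
	 */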
	lvx v0,0,r3
	lvx v1,off16,r3
	lvx v2,off32,r3
	lvx v3,off48,r3
	lvx v4,off64,r3
	lvx v5,off80,r3
	lvx v6,off96,r3
	lvx v7,off112,r3
	addi r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0-112 bytes) */
	cmpdi r7,0
	beq 1f

	lvx v16,0,r4
	lvx v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off16,r4
	lvx v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off32,r4
	lvx v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off48,r4
	lvx v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off64,r4
	lvx v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off80,r4
	lvx v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off96,r4
	lvx v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor v0,v0,v1
	vxor v2,v2,v3
	vxor v4,v4,v5
	vxor v6,v6,v7

	vxor v0,v0,v2
	vxor v4,v4,v6

	vxor v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	LOAD_REG_ADDR(r3, .barrett_constants)

	lvx const1,0,r3
	lvx const2,off16,r3

	vsldoi v1,v0,v0,8
	vxor v0,v0,v1		/* xor two 64-bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl v0,v0,v1
#endif

	vand v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate
	 * q, the quotient, so that q*n is the multiple of our polynomial
	 * that we need to subtract. By doing the computation 2x bits higher
	 * (i.e. 64 bits) and shifting the result back down 2x bits, we
	 * round down to the nearest multiple.
	 */
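	/*
	 * A scalar sketch of the next few instructions (illustrative only;
	 * clmul() stands for a 64x64 -> 128 bit carry-less multiply, and
	 * m, n are the two constants loaded above as const1/const2):
	 *
	 *	q = high64(clmul(a, m));	// floor(a * m / 2^64)
	 *	r = a ^ low64(clmul(q, n));	// a - q*n; low 32 bits = crc
	 */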
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
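	/*
	 * Scalar sketch of the reflected variant below (illustrative only;
	 * clmul() as above, with bit-reflected constants m' and n' loaded
	 * as const1/const2):
	 *
	 *	q = low32(clmul(low32(a), m'));
	 *	r = a ^ low64(clmul(q, n'));	// low 32 bits cancel, the
	 *					// crc lands in bits 32-63
	 */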
	vand v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (i.e. the low 32 bits) is
	 * in the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi v0,v0,zeroes,4	/* shift result into top 64 bits */
#endif

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	subi r6,r1,56+10*16
	subi r7,r1,56+2*16

	lvx v20,0,r6
	lvx v21,off16,r6
	lvx v22,off32,r6
	lvx v23,off48,r6
	lvx v24,off64,r6
	lvx v25,off80,r6
	lvx v26,off96,r6
	lvx v27,off112,r6
	lvx v28,0,r7
	lvx v29,off16,r7

	ld r31,-8(r1)
	ld r30,-16(r1)
	ld r29,-24(r1)
	ld r28,-32(r1)
	ld r27,-40(r1)
	ld r26,-48(r1)
	ld r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	lvx const1,0,r3
	addi r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b .Lsecond_cool_down

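/*
 * Lengths below 256 bytes: skip the folding loop entirely, multiply each
 * 16-byte chunk of input by the matching constant from .short_constants,
 * accumulate into two alternating registers and drop straight into the
 * Barrett reduction.
 */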
.Lshort:
	cmpdi r5,0
	beq .Lzero

	LOAD_REG_ADDR(r3, .short_constants)

	/* Calculate where in the constant table we need to start */
	subfic r6,r5,256
	add r3,r3,r6

	/* How many 16-byte chunks? */
	srdi r7,r5,4
	mtctr r7

	vxor v19,v19,v19
	vxor v20,v20,v20

	lvx v0,0,r4
	lvx v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz .Lv0

	lvx v1,off16,r4
	lvx v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz .Lv1

	lvx v2,off32,r4
	lvx v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz .Lv2

	lvx v3,off48,r4
	lvx v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz .Lv3

	lvx v4,off64,r4
	lvx v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz .Lv4

	lvx v5,off80,r4
	lvx v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz .Lv5

	lvx v6,off96,r4
	lvx v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz .Lv6

	lvx v7,off112,r4
	lvx v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz .Lv7

	addi r3,r3,128
	addi r4,r4,128

	lvx v8,0,r4
	lvx v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz .Lv8

	lvx v9,off16,r4
	lvx v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz .Lv9

	lvx v10,off32,r4
	lvx v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz .Lv10

	lvx v11,off48,r4
	lvx v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz .Lv11

	lvx v12,off64,r4
	lvx v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz .Lv12

	lvx v13,off80,r4
	lvx v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz .Lv13

	lvx v14,off96,r4
	lvx v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz .Lv14

	lvx v15,off112,r4
	lvx v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

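/*
 * Fall-through xor chain: whichever .LvN label the bdz above branched to,
 * every partial product computed so far (vN down to v0) is folded into
 * v19 or v20, alternating between the two so the chains stay independent.
 */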
.Lv15:	vxor v19,v19,v15
.Lv14:	vxor v20,v20,v14
.Lv13:	vxor v19,v19,v13
.Lv12:	vxor v20,v20,v12
.Lv11:	vxor v19,v19,v11
.Lv10:	vxor v20,v20,v10
.Lv9:	vxor v19,v19,v9
.Lv8:	vxor v20,v20,v8
.Lv7:	vxor v19,v19,v7
.Lv6:	vxor v20,v20,v6
.Lv5:	vxor v19,v19,v5
.Lv4:	vxor v20,v20,v4
.Lv3:	vxor v19,v19,v3
.Lv2:	vxor v20,v20,v2
.Lv1:	vxor v19,v19,v1
.Lv0:	vxor v20,v20,v0

	vxor v0,v19,v20

	b .Lbarrett_reduction

.Lzero:
	mr r3,r10
	b .Lout

FUNC_END(CRC_FUNCTION_NAME)