1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * Core of the accelerated CRC algorithm. |
* In your file, define the constants and CRC_FUNCTION_NAME, then
* include this file.
6 | * |
* Calculate the checksum of data that is 16 byte aligned and whose length
* is a multiple of 16 bytes.
9 | * |
10 | * The first step is to reduce it to 1024 bits. We do this in 8 parallel |
11 | * chunks in order to mask the latency of the vpmsum instructions. If we |
12 | * have more than 32 kB of data to checksum we repeat this step multiple |
13 | * times, passing in the previous 1024 bits. |
14 | * |
15 | * The next step is to reduce the 1024 bits to 64 bits. This step adds |
16 | * 32 bits of 0s to the end - this matches what a CRC does. We just |
* calculate constants that land the data in these 32 bits.
18 | * |
19 | * We then use fixed point Barrett reduction to compute a mod n over GF(2) |
* for n = CRC polynomial, using POWER8 instructions. We use x = 32.
21 | * |
22 | * https://en.wikipedia.org/wiki/Barrett_reduction |
23 | * |
24 | * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM |
25 | */ |
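
/*
 * For reference, a rough C model of the carryless (GF(2)) multiply that
 * the vpmsum instructions perform. This is a documentation sketch only,
 * not part of the build, and clmul64() is a name invented here:
 *
 *	// 64 x 64 -> 128 bit carryless multiply
 *	static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
 *	{
 *		uint64_t rh = 0, rl = 0;
 *
 *		for (int i = 0; i < 64; i++) {
 *			if ((b >> i) & 1) {
 *				rl ^= a << i;
 *				if (i)
 *					rh ^= a >> (64 - i);
 *			}
 *		}
 *		*hi = rh;
 *		*lo = rl;
 *	}
 *
 * vpmsumd XORs the two such products of the doublewords of its inputs
 * into one 128 bit result; vpmsumw is analogous but XORs 32 x 32 bit
 * products of adjacent word pairs into each 64 bit half of the result.
 */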
26 | |
27 | #include <asm/ppc_asm.h> |
28 | #include <asm/ppc-opcode.h> |
29 | |
30 | #define MAX_SIZE 32768 |
31 | |
32 | .text |
33 | |
34 | #if defined(__BIG_ENDIAN__) && defined(REFLECT) |
35 | #define BYTESWAP_DATA |
36 | #elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) |
37 | #define BYTESWAP_DATA |
38 | #else |
39 | #undef BYTESWAP_DATA |
40 | #endif |
41 | |
42 | #define off16 r25 |
43 | #define off32 r26 |
44 | #define off48 r27 |
45 | #define off64 r28 |
46 | #define off80 r29 |
47 | #define off96 r30 |
48 | #define off112 r31 |
49 | |
50 | #define const1 v24 |
51 | #define const2 v25 |
52 | |
53 | #define byteswap v26 |
54 | #define mask_32bit v27 |
55 | #define mask_64bit v28 |
56 | #define zeroes v29 |
57 | |
58 | #ifdef BYTESWAP_DATA |
59 | #define VPERM(A, B, C, D) vperm A, B, C, D |
60 | #else |
61 | #define VPERM(A, B, C, D) |
62 | #endif |
63 | |
64 | /* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */ |
65 | FUNC_START(CRC_FUNCTION_NAME) |
66 | std r31,-8(r1) |
67 | std r30,-16(r1) |
68 | std r29,-24(r1) |
69 | std r28,-32(r1) |
70 | std r27,-40(r1) |
71 | std r26,-48(r1) |
72 | std r25,-56(r1) |
73 | |
74 | li off16,16 |
75 | li off32,32 |
76 | li off48,48 |
77 | li off64,64 |
78 | li off80,80 |
79 | li off96,96 |
80 | li off112,112 |
81 | li r0,0 |
82 | |
/* Enough room for saving 10 non-volatile VMX registers */
84 | subi r6,r1,56+10*16 |
85 | subi r7,r1,56+2*16 |
86 | |
87 | stvx v20,0,r6 |
88 | stvx v21,off16,r6 |
89 | stvx v22,off32,r6 |
90 | stvx v23,off48,r6 |
91 | stvx v24,off64,r6 |
92 | stvx v25,off80,r6 |
93 | stvx v26,off96,r6 |
94 | stvx v27,off112,r6 |
95 | stvx v28,0,r7 |
96 | stvx v29,off16,r7 |
97 | |
mr r10,r3 /* save the initial crc so .Lzero can return it unchanged */
99 | |
100 | vxor zeroes,zeroes,zeroes |
vspltisw v0,-1 /* v0 = all one bits */
102 | |
103 | vsldoi mask_32bit,zeroes,v0,4 |
104 | vsldoi mask_64bit,zeroes,v0,8 |
105 | |
106 | /* Get the initial value into v8 */ |
107 | vxor v8,v8,v8 |
108 | MTVRD(v8, R3) |
109 | #ifdef REFLECT |
110 | vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ |
111 | #else |
112 | vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ |
113 | #endif |
114 | |
115 | #ifdef BYTESWAP_DATA |
116 | LOAD_REG_ADDR(r3, .byteswap_constant) |
117 | lvx byteswap,0,r3 |
118 | addi r3,r3,16 |
119 | #endif |
120 | |
121 | cmpdi r5,256 |
122 | blt .Lshort |
123 | |
rldicr r6,r5,0,56 /* length rounded down to a multiple of 128 */
125 | |
126 | /* Checksum in blocks of MAX_SIZE */ |
127 | 1: lis r7,MAX_SIZE@h |
128 | ori r7,r7,MAX_SIZE@l |
129 | mr r9,r7 |
130 | cmpd r6,r7 |
131 | bgt 2f |
132 | mr r7,r6 |
133 | 2: subf r6,r7,r6 |
134 | |
135 | /* our main loop does 128 bytes at a time */ |
136 | srdi r7,r7,7 |
137 | |
/*
 * Work out the offset into the constants table to start at. Each
 * constant is 16 bytes, and it is used against 128 bytes of input
 * data - 128 / 16 = 8. The table is sized for MAX_SIZE worth of input,
 * so when this pass covers less than MAX_SIZE we skip the leading
 * entries: offset = MAX_SIZE/8 - loops * 16
 */
143 | sldi r8,r7,4 |
144 | srdi r9,r9,3 |
145 | subf r8,r8,r9 |
146 | |
147 | /* We reduce our final 128 bytes in a separate step */ |
148 | addi r7,r7,-1 |
149 | mtctr r7 |
150 | |
151 | LOAD_REG_ADDR(r3, .constants) |
152 | |
153 | /* Find the start of our constants */ |
154 | add r3,r3,r8 |
155 | |
156 | /* zero v0-v7 which will contain our checksums */ |
157 | vxor v0,v0,v0 |
158 | vxor v1,v1,v1 |
159 | vxor v2,v2,v2 |
160 | vxor v3,v3,v3 |
161 | vxor v4,v4,v4 |
162 | vxor v5,v5,v5 |
163 | vxor v6,v6,v6 |
164 | vxor v7,v7,v7 |
165 | |
166 | lvx const1,0,r3 |
167 | |
168 | /* |
169 | * If we are looping back to consume more data we use the values |
170 | * already in v16-v23. |
171 | */ |
172 | cmpdi r0,1 |
173 | beq 2f |
174 | |
175 | /* First warm up pass */ |
176 | lvx v16,0,r4 |
177 | lvx v17,off16,r4 |
178 | VPERM(v16,v16,v16,byteswap) |
179 | VPERM(v17,v17,v17,byteswap) |
180 | lvx v18,off32,r4 |
181 | lvx v19,off48,r4 |
182 | VPERM(v18,v18,v18,byteswap) |
183 | VPERM(v19,v19,v19,byteswap) |
184 | lvx v20,off64,r4 |
185 | lvx v21,off80,r4 |
186 | VPERM(v20,v20,v20,byteswap) |
187 | VPERM(v21,v21,v21,byteswap) |
188 | lvx v22,off96,r4 |
189 | lvx v23,off112,r4 |
190 | VPERM(v22,v22,v22,byteswap) |
191 | VPERM(v23,v23,v23,byteswap) |
192 | addi r4,r4,8*16 |
193 | |
194 | /* xor in initial value */ |
195 | vxor v16,v16,v8 |
196 | |
197 | 2: bdz .Lfirst_warm_up_done |
198 | |
199 | addi r3,r3,16 |
200 | lvx const2,0,r3 |
201 | |
202 | /* Second warm up pass */ |
203 | VPMSUMD(v8,v16,const1) |
204 | lvx v16,0,r4 |
205 | VPERM(v16,v16,v16,byteswap) |
ori r2,r2,0 /* nop, presumably to pad the dispatch group */
207 | |
208 | VPMSUMD(v9,v17,const1) |
209 | lvx v17,off16,r4 |
210 | VPERM(v17,v17,v17,byteswap) |
211 | ori r2,r2,0 |
212 | |
213 | VPMSUMD(v10,v18,const1) |
214 | lvx v18,off32,r4 |
215 | VPERM(v18,v18,v18,byteswap) |
216 | ori r2,r2,0 |
217 | |
218 | VPMSUMD(v11,v19,const1) |
219 | lvx v19,off48,r4 |
220 | VPERM(v19,v19,v19,byteswap) |
221 | ori r2,r2,0 |
222 | |
223 | VPMSUMD(v12,v20,const1) |
224 | lvx v20,off64,r4 |
225 | VPERM(v20,v20,v20,byteswap) |
226 | ori r2,r2,0 |
227 | |
228 | VPMSUMD(v13,v21,const1) |
229 | lvx v21,off80,r4 |
230 | VPERM(v21,v21,v21,byteswap) |
231 | ori r2,r2,0 |
232 | |
233 | VPMSUMD(v14,v22,const1) |
234 | lvx v22,off96,r4 |
235 | VPERM(v22,v22,v22,byteswap) |
236 | ori r2,r2,0 |
237 | |
238 | VPMSUMD(v15,v23,const1) |
239 | lvx v23,off112,r4 |
240 | VPERM(v23,v23,v23,byteswap) |
241 | |
242 | addi r4,r4,8*16 |
243 | |
244 | bdz .Lfirst_cool_down |
245 | |
246 | /* |
247 | * main loop. We modulo schedule it such that it takes three iterations |
248 | * to complete - first iteration load, second iteration vpmsum, third |
249 | * iteration xor. |
250 | */ |
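/*
 * Schematically, for each 128 byte chunk (a sketch of the schedule, not
 * extra work):
 *
 *	iteration i:	load the chunk into v16-v23
 *	iteration i+1:	vpmsum it against a constant into v8-v15
 *	iteration i+2:	xor the products into the v0-v7 accumulators
 */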
251 | .balign 16 |
252 | 4: lvx const1,0,r3 |
253 | addi r3,r3,16 |
254 | ori r2,r2,0 |
255 | |
256 | vxor v0,v0,v8 |
257 | VPMSUMD(v8,v16,const2) |
258 | lvx v16,0,r4 |
259 | VPERM(v16,v16,v16,byteswap) |
260 | ori r2,r2,0 |
261 | |
262 | vxor v1,v1,v9 |
263 | VPMSUMD(v9,v17,const2) |
264 | lvx v17,off16,r4 |
265 | VPERM(v17,v17,v17,byteswap) |
266 | ori r2,r2,0 |
267 | |
268 | vxor v2,v2,v10 |
269 | VPMSUMD(v10,v18,const2) |
270 | lvx v18,off32,r4 |
271 | VPERM(v18,v18,v18,byteswap) |
272 | ori r2,r2,0 |
273 | |
274 | vxor v3,v3,v11 |
275 | VPMSUMD(v11,v19,const2) |
276 | lvx v19,off48,r4 |
277 | VPERM(v19,v19,v19,byteswap) |
278 | lvx const2,0,r3 |
279 | ori r2,r2,0 |
280 | |
281 | vxor v4,v4,v12 |
282 | VPMSUMD(v12,v20,const1) |
283 | lvx v20,off64,r4 |
284 | VPERM(v20,v20,v20,byteswap) |
285 | ori r2,r2,0 |
286 | |
287 | vxor v5,v5,v13 |
288 | VPMSUMD(v13,v21,const1) |
289 | lvx v21,off80,r4 |
290 | VPERM(v21,v21,v21,byteswap) |
291 | ori r2,r2,0 |
292 | |
293 | vxor v6,v6,v14 |
294 | VPMSUMD(v14,v22,const1) |
295 | lvx v22,off96,r4 |
296 | VPERM(v22,v22,v22,byteswap) |
297 | ori r2,r2,0 |
298 | |
299 | vxor v7,v7,v15 |
300 | VPMSUMD(v15,v23,const1) |
301 | lvx v23,off112,r4 |
302 | VPERM(v23,v23,v23,byteswap) |
303 | |
304 | addi r4,r4,8*16 |
305 | |
306 | bdnz 4b |
307 | |
308 | .Lfirst_cool_down: |
309 | /* First cool down pass */ |
310 | lvx const1,0,r3 |
311 | addi r3,r3,16 |
312 | |
313 | vxor v0,v0,v8 |
314 | VPMSUMD(v8,v16,const1) |
315 | ori r2,r2,0 |
316 | |
317 | vxor v1,v1,v9 |
318 | VPMSUMD(v9,v17,const1) |
319 | ori r2,r2,0 |
320 | |
321 | vxor v2,v2,v10 |
322 | VPMSUMD(v10,v18,const1) |
323 | ori r2,r2,0 |
324 | |
325 | vxor v3,v3,v11 |
326 | VPMSUMD(v11,v19,const1) |
327 | ori r2,r2,0 |
328 | |
329 | vxor v4,v4,v12 |
330 | VPMSUMD(v12,v20,const1) |
331 | ori r2,r2,0 |
332 | |
333 | vxor v5,v5,v13 |
334 | VPMSUMD(v13,v21,const1) |
335 | ori r2,r2,0 |
336 | |
337 | vxor v6,v6,v14 |
338 | VPMSUMD(v14,v22,const1) |
339 | ori r2,r2,0 |
340 | |
341 | vxor v7,v7,v15 |
342 | VPMSUMD(v15,v23,const1) |
343 | ori r2,r2,0 |
344 | |
345 | .Lsecond_cool_down: |
346 | /* Second cool down pass */ |
347 | vxor v0,v0,v8 |
348 | vxor v1,v1,v9 |
349 | vxor v2,v2,v10 |
350 | vxor v3,v3,v11 |
351 | vxor v4,v4,v12 |
352 | vxor v5,v5,v13 |
353 | vxor v6,v6,v14 |
354 | vxor v7,v7,v15 |
355 | |
356 | #ifdef REFLECT |
357 | /* |
358 | * vpmsumd produces a 96 bit result in the least significant bits |
359 | * of the register. Since we are bit reflected we have to shift it |
360 | * left 32 bits so it occupies the least significant bits in the |
361 | * bit reflected domain. |
362 | */ |
363 | vsldoi v0,v0,zeroes,4 |
364 | vsldoi v1,v1,zeroes,4 |
365 | vsldoi v2,v2,zeroes,4 |
366 | vsldoi v3,v3,zeroes,4 |
367 | vsldoi v4,v4,zeroes,4 |
368 | vsldoi v5,v5,zeroes,4 |
369 | vsldoi v6,v6,zeroes,4 |
370 | vsldoi v7,v7,zeroes,4 |
371 | #endif |
372 | |
373 | /* xor with last 1024 bits */ |
374 | lvx v8,0,r4 |
375 | lvx v9,off16,r4 |
376 | VPERM(v8,v8,v8,byteswap) |
377 | VPERM(v9,v9,v9,byteswap) |
378 | lvx v10,off32,r4 |
379 | lvx v11,off48,r4 |
380 | VPERM(v10,v10,v10,byteswap) |
381 | VPERM(v11,v11,v11,byteswap) |
382 | lvx v12,off64,r4 |
383 | lvx v13,off80,r4 |
384 | VPERM(v12,v12,v12,byteswap) |
385 | VPERM(v13,v13,v13,byteswap) |
386 | lvx v14,off96,r4 |
387 | lvx v15,off112,r4 |
388 | VPERM(v14,v14,v14,byteswap) |
389 | VPERM(v15,v15,v15,byteswap) |
390 | |
391 | addi r4,r4,8*16 |
392 | |
393 | vxor v16,v0,v8 |
394 | vxor v17,v1,v9 |
395 | vxor v18,v2,v10 |
396 | vxor v19,v3,v11 |
397 | vxor v20,v4,v12 |
398 | vxor v21,v5,v13 |
399 | vxor v22,v6,v14 |
400 | vxor v23,v7,v15 |
401 | |
402 | li r0,1 |
403 | cmpdi r6,0 |
404 | addi r6,r6,128 |
405 | bne 1b |
406 | |
407 | /* Work out how many bytes we have left */ |
408 | andi. r5,r5,127 |
409 | |
410 | /* Calculate where in the constant table we need to start */ |
411 | subfic r6,r5,128 |
412 | add r3,r3,r6 |
413 | |
414 | /* How many 16 byte chunks are in the tail */ |
415 | srdi r7,r5,4 |
416 | mtctr r7 |
417 | |
418 | /* |
419 | * Reduce the previously calculated 1024 bits to 64 bits, shifting |
420 | * 32 bits to include the trailing 32 bits of zeros |
421 | */ |
422 | lvx v0,0,r3 |
423 | lvx v1,off16,r3 |
424 | lvx v2,off32,r3 |
425 | lvx v3,off48,r3 |
426 | lvx v4,off64,r3 |
427 | lvx v5,off80,r3 |
428 | lvx v6,off96,r3 |
429 | lvx v7,off112,r3 |
430 | addi r3,r3,8*16 |
431 | |
432 | VPMSUMW(v0,v16,v0) |
433 | VPMSUMW(v1,v17,v1) |
434 | VPMSUMW(v2,v18,v2) |
435 | VPMSUMW(v3,v19,v3) |
436 | VPMSUMW(v4,v20,v4) |
437 | VPMSUMW(v5,v21,v5) |
438 | VPMSUMW(v6,v22,v6) |
439 | VPMSUMW(v7,v23,v7) |
440 | |
441 | /* Now reduce the tail (0 - 112 bytes) */ |
442 | cmpdi r7,0 |
443 | beq 1f |
444 | |
445 | lvx v16,0,r4 |
446 | lvx v17,0,r3 |
447 | VPERM(v16,v16,v16,byteswap) |
448 | VPMSUMW(v16,v16,v17) |
449 | vxor v0,v0,v16 |
450 | bdz 1f |
451 | |
452 | lvx v16,off16,r4 |
453 | lvx v17,off16,r3 |
454 | VPERM(v16,v16,v16,byteswap) |
455 | VPMSUMW(v16,v16,v17) |
456 | vxor v0,v0,v16 |
457 | bdz 1f |
458 | |
459 | lvx v16,off32,r4 |
460 | lvx v17,off32,r3 |
461 | VPERM(v16,v16,v16,byteswap) |
462 | VPMSUMW(v16,v16,v17) |
463 | vxor v0,v0,v16 |
464 | bdz 1f |
465 | |
466 | lvx v16,off48,r4 |
467 | lvx v17,off48,r3 |
468 | VPERM(v16,v16,v16,byteswap) |
469 | VPMSUMW(v16,v16,v17) |
470 | vxor v0,v0,v16 |
471 | bdz 1f |
472 | |
473 | lvx v16,off64,r4 |
474 | lvx v17,off64,r3 |
475 | VPERM(v16,v16,v16,byteswap) |
476 | VPMSUMW(v16,v16,v17) |
477 | vxor v0,v0,v16 |
478 | bdz 1f |
479 | |
480 | lvx v16,off80,r4 |
481 | lvx v17,off80,r3 |
482 | VPERM(v16,v16,v16,byteswap) |
483 | VPMSUMW(v16,v16,v17) |
484 | vxor v0,v0,v16 |
485 | bdz 1f |
486 | |
487 | lvx v16,off96,r4 |
488 | lvx v17,off96,r3 |
489 | VPERM(v16,v16,v16,byteswap) |
490 | VPMSUMW(v16,v16,v17) |
491 | vxor v0,v0,v16 |
492 | |
493 | /* Now xor all the parallel chunks together */ |
494 | 1: vxor v0,v0,v1 |
495 | vxor v2,v2,v3 |
496 | vxor v4,v4,v5 |
497 | vxor v6,v6,v7 |
498 | |
499 | vxor v0,v0,v2 |
500 | vxor v4,v4,v6 |
501 | |
502 | vxor v0,v0,v4 |
503 | |
504 | .Lbarrett_reduction: |
505 | /* Barrett constants */ |
506 | LOAD_REG_ADDR(r3, .barrett_constants) |
507 | |
508 | lvx const1,0,r3 |
509 | lvx const2,off16,r3 |
510 | |
511 | vsldoi v1,v0,v0,8 |
512 | vxor v0,v0,v1 /* xor two 64 bit results together */ |
513 | |
514 | #ifdef REFLECT |
/*
 * shift left one bit - the carryless product of two bit reflected
 * values comes out shifted right by one, so correct for that here
 */
516 | vspltisb v1,1 |
517 | vsl v0,v0,v1 |
518 | #endif |
519 | |
520 | vand v0,v0,mask_64bit |
521 | #ifndef REFLECT |
/*
 * Now for the Barrett reduction algorithm. The idea is to calculate q,
 * where q times our polynomial n is the multiple of n we need to
 * subtract. By doing the computation 2x bits higher (ie 64 bits) and
 * shifting the result back down 2x bits, we round down to the nearest
 * multiple.
 */
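/*
 * A C sketch of this step (using the clmul64() model from the top of
 * this file; m and n are stand-in names for the two .barrett_constants
 * entries, the precomputed Barrett constant and the CRC polynomial):
 *
 *	uint64_t hi, lo, q;
 *
 *	clmul64(a, m, &hi, &lo);	// ma
 *	q = hi;				// q = floor(ma / 2^64)
 *	clmul64(q, n, &hi, &lo);	// qn
 *	a ^= lo;			// a - qn, remainder in low 32 bits
 */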
528 | VPMSUMD(v1,v0,const1) /* ma */ |
529 | vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ |
530 | VPMSUMD(v1,v1,const2) /* qn */ |
531 | vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ |
532 | |
533 | /* |
534 | * Get the result into r3. We need to shift it left 8 bytes: |
535 | * V0 [ 0 1 2 X ] |
536 | * V0 [ 0 X 2 3 ] |
537 | */ |
538 | vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ |
539 | #else |
540 | /* |
541 | * The reflected version of Barrett reduction. Instead of bit |
542 | * reflecting our data (which is expensive to do), we bit reflect our |
543 | * constants and our algorithm, which means the intermediate data in |
544 | * our vector registers goes from 0-63 instead of 63-0. We can reflect |
545 | * the algorithm because we don't carry in mod 2 arithmetic. |
546 | */ |
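/*
 * The same step as a C sketch (clmul64() as modelled at the top of this
 * file; m and n again stand in for the two .barrett_constants entries,
 * here in bit reflected form):
 *
 *	uint64_t hi, lo, v;
 *
 *	v = a & 0xffffffff;		// bottom 32 bits of a
 *	clmul64(v, m, &hi, &lo);	// ma
 *	v = lo & 0xffffffff;		// bottom 32 bits of ma
 *	clmul64(v, n, &hi, &lo);	// qn
 *	a ^= lo;			// a - qn, result in high 32 bits
 */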
547 | vand v1,v0,mask_32bit /* bottom 32 bits of a */ |
548 | VPMSUMD(v1,v1,const1) /* ma */ |
549 | vand v1,v1,mask_32bit /* bottom 32bits of ma */ |
550 | VPMSUMD(v1,v1,const2) /* qn */ |
551 | vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ |
552 | |
553 | /* |
554 | * Since we are bit reflected, the result (ie the low 32 bits) is in |
555 | * the high 32 bits. We just need to shift it left 4 bytes |
556 | * V0 [ 0 1 X 3 ] |
557 | * V0 [ 0 X 2 3 ] |
558 | */ |
vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits */
560 | #endif |
561 | |
562 | /* Get it into r3 */ |
563 | MFVRD(R3, v0) |
564 | |
565 | .Lout: |
566 | subi r6,r1,56+10*16 |
567 | subi r7,r1,56+2*16 |
568 | |
569 | lvx v20,0,r6 |
570 | lvx v21,off16,r6 |
571 | lvx v22,off32,r6 |
572 | lvx v23,off48,r6 |
573 | lvx v24,off64,r6 |
574 | lvx v25,off80,r6 |
575 | lvx v26,off96,r6 |
576 | lvx v27,off112,r6 |
577 | lvx v28,0,r7 |
578 | lvx v29,off16,r7 |
579 | |
580 | ld r31,-8(r1) |
581 | ld r30,-16(r1) |
582 | ld r29,-24(r1) |
583 | ld r28,-32(r1) |
584 | ld r27,-40(r1) |
585 | ld r26,-48(r1) |
586 | ld r25,-56(r1) |
587 | |
588 | blr |
589 | |
590 | .Lfirst_warm_up_done: |
591 | lvx const1,0,r3 |
592 | addi r3,r3,16 |
593 | |
594 | VPMSUMD(v8,v16,const1) |
595 | VPMSUMD(v9,v17,const1) |
596 | VPMSUMD(v10,v18,const1) |
597 | VPMSUMD(v11,v19,const1) |
598 | VPMSUMD(v12,v20,const1) |
599 | VPMSUMD(v13,v21,const1) |
600 | VPMSUMD(v14,v22,const1) |
601 | VPMSUMD(v15,v23,const1) |
602 | |
603 | b .Lsecond_cool_down |
604 | |
605 | .Lshort: |
606 | cmpdi r5,0 |
607 | beq .Lzero |
608 | |
609 | LOAD_REG_ADDR(r3, .short_constants) |
610 | |
611 | /* Calculate where in the constant table we need to start */ |
612 | subfic r6,r5,256 |
613 | add r3,r3,r6 |
614 | |
615 | /* How many 16 byte chunks? */ |
616 | srdi r7,r5,4 |
617 | mtctr r7 |
618 | |
/* zero v19 and v20, our two accumulators */
vxor v19,v19,v19
vxor v20,v20,v20
621 | |
622 | lvx v0,0,r4 |
623 | lvx v16,0,r3 |
624 | VPERM(v0,v0,v16,byteswap) |
625 | vxor v0,v0,v8 /* xor in initial value */ |
626 | VPMSUMW(v0,v0,v16) |
627 | bdz .Lv0 |
628 | |
629 | lvx v1,off16,r4 |
630 | lvx v17,off16,r3 |
631 | VPERM(v1,v1,v17,byteswap) |
632 | VPMSUMW(v1,v1,v17) |
633 | bdz .Lv1 |
634 | |
635 | lvx v2,off32,r4 |
636 | lvx v16,off32,r3 |
637 | VPERM(v2,v2,v16,byteswap) |
638 | VPMSUMW(v2,v2,v16) |
639 | bdz .Lv2 |
640 | |
641 | lvx v3,off48,r4 |
642 | lvx v17,off48,r3 |
643 | VPERM(v3,v3,v17,byteswap) |
644 | VPMSUMW(v3,v3,v17) |
645 | bdz .Lv3 |
646 | |
647 | lvx v4,off64,r4 |
648 | lvx v16,off64,r3 |
649 | VPERM(v4,v4,v16,byteswap) |
650 | VPMSUMW(v4,v4,v16) |
651 | bdz .Lv4 |
652 | |
653 | lvx v5,off80,r4 |
654 | lvx v17,off80,r3 |
655 | VPERM(v5,v5,v17,byteswap) |
656 | VPMSUMW(v5,v5,v17) |
657 | bdz .Lv5 |
658 | |
659 | lvx v6,off96,r4 |
660 | lvx v16,off96,r3 |
661 | VPERM(v6,v6,v16,byteswap) |
662 | VPMSUMW(v6,v6,v16) |
663 | bdz .Lv6 |
664 | |
665 | lvx v7,off112,r4 |
666 | lvx v17,off112,r3 |
667 | VPERM(v7,v7,v17,byteswap) |
668 | VPMSUMW(v7,v7,v17) |
669 | bdz .Lv7 |
670 | |
671 | addi r3,r3,128 |
672 | addi r4,r4,128 |
673 | |
674 | lvx v8,0,r4 |
675 | lvx v16,0,r3 |
676 | VPERM(v8,v8,v16,byteswap) |
677 | VPMSUMW(v8,v8,v16) |
678 | bdz .Lv8 |
679 | |
680 | lvx v9,off16,r4 |
681 | lvx v17,off16,r3 |
682 | VPERM(v9,v9,v17,byteswap) |
683 | VPMSUMW(v9,v9,v17) |
684 | bdz .Lv9 |
685 | |
686 | lvx v10,off32,r4 |
687 | lvx v16,off32,r3 |
688 | VPERM(v10,v10,v16,byteswap) |
689 | VPMSUMW(v10,v10,v16) |
690 | bdz .Lv10 |
691 | |
692 | lvx v11,off48,r4 |
693 | lvx v17,off48,r3 |
694 | VPERM(v11,v11,v17,byteswap) |
695 | VPMSUMW(v11,v11,v17) |
696 | bdz .Lv11 |
697 | |
698 | lvx v12,off64,r4 |
699 | lvx v16,off64,r3 |
700 | VPERM(v12,v12,v16,byteswap) |
701 | VPMSUMW(v12,v12,v16) |
702 | bdz .Lv12 |
703 | |
704 | lvx v13,off80,r4 |
705 | lvx v17,off80,r3 |
706 | VPERM(v13,v13,v17,byteswap) |
707 | VPMSUMW(v13,v13,v17) |
708 | bdz .Lv13 |
709 | |
710 | lvx v14,off96,r4 |
711 | lvx v16,off96,r3 |
712 | VPERM(v14,v14,v16,byteswap) |
713 | VPMSUMW(v14,v14,v16) |
714 | bdz .Lv14 |
715 | |
716 | lvx v15,off112,r4 |
717 | lvx v17,off112,r3 |
718 | VPERM(v15,v15,v17,byteswap) |
719 | VPMSUMW(v15,v15,v17) |
720 | |
721 | .Lv15: vxor v19,v19,v15 |
722 | .Lv14: vxor v20,v20,v14 |
723 | .Lv13: vxor v19,v19,v13 |
724 | .Lv12: vxor v20,v20,v12 |
725 | .Lv11: vxor v19,v19,v11 |
726 | .Lv10: vxor v20,v20,v10 |
727 | .Lv9: vxor v19,v19,v9 |
728 | .Lv8: vxor v20,v20,v8 |
729 | .Lv7: vxor v19,v19,v7 |
730 | .Lv6: vxor v20,v20,v6 |
731 | .Lv5: vxor v19,v19,v5 |
732 | .Lv4: vxor v20,v20,v4 |
733 | .Lv3: vxor v19,v19,v3 |
734 | .Lv2: vxor v20,v20,v2 |
735 | .Lv1: vxor v19,v19,v1 |
736 | .Lv0: vxor v20,v20,v0 |
737 | |
738 | vxor v0,v19,v20 |
739 | |
740 | b .Lbarrett_reduction |
741 | |
742 | .Lzero: |
743 | mr r3,r10 |
744 | b .Lout |
745 | |
746 | FUNC_END(CRC_FUNCTION_NAME) |
747 | |