/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME,
 * then include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 */
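/*
 * A file that includes this core is expected to look roughly like the
 * sketch below (all names in the sketch are illustrative, nothing here
 * defines them): it provides the .rodata tables referenced later
 * (.byteswap_constant, .constants, .short_constants, .barrett_constants),
 * optionally defines REFLECT for a bit-reflected CRC, names the function
 * and then pulls in this file.
 *
 *	#include "my_crc_constants.h"		// the four constant tables
 *	#define REFLECT				// only for reflected CRCs
 *	#define CRC_FUNCTION_NAME __my_crc_vpmsum
 *	#include "crc32-vpmsum_core.S"
 */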

#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define MAX_SIZE 32768

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

#define off16 r25
#define off32 r26
#define off48 r27
#define off64 r28
#define off80 r29
#define off96 r30
#define off112 r31

#define const1 v24
#define const2 v25

#define byteswap v26
#define mask_32bit v27
#define mask_64bit v28
#define zeroes v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
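/*
 * Semantically, for the REFLECT (bit-reflected) configuration this routine
 * is meant to match the minimal byte-at-a-time sketch below, assuming a
 * reflected 32-bit polynomial REFLECTED_POLY supplied alongside the
 * constant tables (the name is illustrative, not defined here):
 *
 *	unsigned int crc_ref(unsigned int crc, const unsigned char *p,
 *			     unsigned long len)
 *	{
 *		while (len--) {
 *			int i;
 *
 *			crc ^= *p++;
 *			for (i = 0; i < 8; i++)
 *				crc = (crc >> 1) ^
 *				      ((crc & 1) ? REFLECTED_POLY : 0);
 *		}
 *		return crc;
 *	}
 *
 * The code below computes the same result 128 bytes at a time with vpmsumd.
 */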
FUNC_START(CRC_FUNCTION_NAME)
	std r31,-8(r1)
	std r30,-16(r1)
	std r29,-24(r1)
	std r28,-32(r1)
	std r27,-40(r1)
	std r26,-48(r1)
	std r25,-56(r1)

	li off16,16
	li off32,32
	li off48,48
	li off64,64
	li off80,80
	li off96,96
	li off112,112
	li r0,0

	/* Enough room for saving 10 non-volatile VMX registers */
	subi r6,r1,56+10*16
	subi r7,r1,56+2*16
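	/*
	 * All of the saves go into the scratch area below the stack pointer
	 * (no stack frame is created): r25-r31 at r1-56 .. r1-8, v20-v27
	 * from r6 = r1-216 upwards, and v28/v29 from r7 = r1-88 upwards.
	 */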

	stvx v20,0,r6
	stvx v21,off16,r6
	stvx v22,off32,r6
	stvx v23,off48,r6
	stvx v24,off64,r6
	stvx v25,off80,r6
	stvx v26,off96,r6
	stvx v27,off112,r6
	stvx v28,0,r7
	stvx v29,off16,r7

	mr r10,r3

	vxor zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi mask_32bit,zeroes,v0,4
	vsldoi mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	LOAD_REG_ADDR(r3, .byteswap_constant)
	lvx byteswap,0,r3
	addi r3,r3,16
#endif

	cmpdi r5,256
	blt .Lshort

	rldicr r6,r5,0,56	/* length rounded down to a multiple of 128 */

	/* Checksum in blocks of MAX_SIZE */
1:	lis r7,MAX_SIZE@h
	ori r7,r7,MAX_SIZE@l
	mr r9,r7
	cmpd r6,r7
	bgt 2f
	mr r7,r6
2:	subf r6,r7,r6

	/* Our main loop does 128 bytes at a time */
	srdi r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes and is used against 128 bytes of input
	 * data, so the table advances at 1/8th of the data rate
	 * (128 / 16 = 8).
	 */
	sldi r8,r7,4
	srdi r9,r9,3
	subf r8,r8,r9
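	/*
	 * Worked example of the three instructions above: a full 32 kB
	 * pass runs 32768 / 128 = 256 iterations and starts at offset
	 * 32768 / 8 - 256 * 16 = 0, i.e. at the top of the table; a
	 * shorter final pass of, say, 16 kB runs 128 iterations and
	 * starts at offset 4096 - 2048 = 2048, so every pass finishes
	 * on the same final constants.
	 */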

	/* We reduce our final 128 bytes in a separate step */
	addi r7,r7,-1
	mtctr r7

	LOAD_REG_ADDR(r3, .constants)

	/* Find the start of our constants */
	add r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor v0,v0,v0
	vxor v1,v1,v1
	vxor v2,v2,v2
	vxor v3,v3,v3
	vxor v4,v4,v4
	vxor v5,v5,v5
	vxor v6,v6,v6
	vxor v7,v7,v7

	lvx const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi r0,1
	beq 2f

	/* First warm up pass */
	lvx v16,0,r4
	lvx v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx v18,off32,r4
	lvx v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx v20,off64,r4
	lvx v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx v22,off96,r4
	lvx v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi r4,r4,8*16

	/* xor in initial value */
	vxor v16,v16,v8

2:	bdz .Lfirst_warm_up_done

	addi r3,r3,16
	lvx const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori r2,r2,0		/* no-op */

	VPMSUMD(v9,v17,const1)
	lvx v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi r4,r4,8*16

	bdz .Lfirst_cool_down

	/*
	 * Main loop. We modulo schedule it such that each block of data
	 * takes three iterations to complete - first iteration load,
	 * second iteration vpmsum, third iteration xor.
	 */
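	/*
	 * Schematically (illustrative pseudo-code only), each iteration
	 * handles all eight streams j = 0..7 as:
	 *
	 *	acc[j]  ^= prod[j];                  // result from two iterations ago
	 *	prod[j]  = clmul(data[j], constant); // data loaded last iteration
	 *	data[j]  = load(p + 16 * j);         // for the next iteration
	 *	p += 128;
	 */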
	.balign 16
4:	lvx const1,0,r3
	addi r3,r3,16
	ori r2,r2,0

	vxor v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori r2,r2,0

	vxor v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori r2,r2,0

	vxor v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori r2,r2,0

	vxor v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx const2,0,r3
	ori r2,r2,0

	vxor v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori r2,r2,0

	vxor v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori r2,r2,0

	vxor v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori r2,r2,0

	vxor v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi r4,r4,8*16

	bdnz 4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx const1,0,r3
	addi r3,r3,16

	vxor v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori r2,r2,0

	vxor v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori r2,r2,0

	vxor v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori r2,r2,0

	vxor v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori r2,r2,0

	vxor v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori r2,r2,0

	vxor v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori r2,r2,0

	vxor v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori r2,r2,0

	vxor v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor v0,v0,v8
	vxor v1,v1,v9
	vxor v2,v2,v10
	vxor v3,v3,v11
	vxor v4,v4,v12
	vxor v5,v5,v13
	vxor v6,v6,v14
	vxor v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96-bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi v0,v0,zeroes,4
	vsldoi v1,v1,zeroes,4
	vsldoi v2,v2,zeroes,4
	vsldoi v3,v3,zeroes,4
	vsldoi v4,v4,zeroes,4
	vsldoi v5,v5,zeroes,4
	vsldoi v6,v6,zeroes,4
	vsldoi v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx v8,0,r4
	lvx v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx v10,off32,r4
	lvx v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx v12,off64,r4
	lvx v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx v14,off96,r4
	lvx v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi r4,r4,8*16

	vxor v16,v0,v8
	vxor v17,v1,v9
	vxor v18,v2,v10
	vxor v19,v3,v11
	vxor v20,v4,v12
	vxor v21,v5,v13
	vxor v22,v6,v14
	vxor v23,v7,v15

	li r0,1
	cmpdi r6,0
	addi r6,r6,128
	bne 1b

	/* Work out how many bytes we have left */
	andi. r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic r6,r5,128
	add r3,r3,r6

	/* How many 16-byte chunks are in the tail */
	srdi r7,r5,4
	mtctr r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
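	/*
	 * Roughly speaking, each remaining word of data is carry-less
	 * multiplied by a precomputed (x^k mod P) whose exponent k is that
	 * word's bit distance from the end of the message plus the 32
	 * appended zero bits, so the partial products can simply be
	 * xor-ed together below.
	 */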
	lvx v0,0,r3
	lvx v1,off16,r3
	lvx v2,off32,r3
	lvx v3,off48,r3
	lvx v4,off64,r3
	lvx v5,off80,r3
	lvx v6,off96,r3
	lvx v7,off112,r3
	addi r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0-112 bytes) */
	cmpdi r7,0
	beq 1f

	lvx v16,0,r4
	lvx v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off16,r4
	lvx v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off32,r4
	lvx v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off48,r4
	lvx v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off64,r4
	lvx v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off80,r4
	lvx v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off96,r4
	lvx v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor v0,v0,v1
	vxor v2,v2,v3
	vxor v4,v4,v5
	vxor v6,v6,v7

	vxor v0,v0,v2
	vxor v4,v4,v6

	vxor v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	LOAD_REG_ADDR(r3, .barrett_constants)

	lvx const1,0,r3
	lvx const2,off16,r3

	vsldoi v1,v0,v0,8
	vxor v0,v0,v1		/* xor two 64-bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl v0,v0,v1
#endif

	vand v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate
	 * q, the quotient, so that q*n is the multiple of our polynomial
	 * that we need to subtract. By doing the computation 2x bits higher
	 * (i.e. 64 bits) and shifting the result back down 2x bits, we
	 * round down to the nearest multiple.
	 */
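	/*
	 * A scalar sketch of the next few instructions (illustrative only;
	 * clmul() stands for a 64x64 -> 128 bit carry-less multiply, and
	 * m, n are the two constants loaded above as const1/const2):
	 *
	 *	q = high64(clmul(a, m));	// floor(a * m / 2^64)
	 *	r = a ^ low64(clmul(q, n));	// a - q*n; low 32 bits = crc
	 */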
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
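	/*
	 * Scalar sketch of the reflected variant below (illustrative only;
	 * clmul() as above, with bit-reflected constants m' and n' loaded
	 * as const1/const2):
	 *
	 *	q = low32(clmul(low32(a), m'));
	 *	r = a ^ low64(clmul(q, n'));	// low 32 bits cancel, the
	 *					// crc lands in bits 32-63
	 */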
	vand v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (i.e. the low 32 bits) is
	 * in the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi v0,v0,zeroes,4	/* shift result into top 64 bits */
#endif

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	subi r6,r1,56+10*16
	subi r7,r1,56+2*16

	lvx v20,0,r6
	lvx v21,off16,r6
	lvx v22,off32,r6
	lvx v23,off48,r6
	lvx v24,off64,r6
	lvx v25,off80,r6
	lvx v26,off96,r6
	lvx v27,off112,r6
	lvx v28,0,r7
	lvx v29,off16,r7

	ld r31,-8(r1)
	ld r30,-16(r1)
	ld r29,-24(r1)
	ld r28,-32(r1)
	ld r27,-40(r1)
	ld r26,-48(r1)
	ld r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	lvx const1,0,r3
	addi r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b .Lsecond_cool_down

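/*
 * Lengths below 256 bytes: skip the folding loop entirely, multiply each
 * 16-byte chunk of input by the matching constant from .short_constants,
 * accumulate into two alternating registers and drop straight into the
 * Barrett reduction.
 */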
.Lshort:
	cmpdi r5,0
	beq .Lzero

	LOAD_REG_ADDR(r3, .short_constants)

	/* Calculate where in the constant table we need to start */
	subfic r6,r5,256
	add r3,r3,r6

	/* How many 16-byte chunks? */
	srdi r7,r5,4
	mtctr r7

	vxor v19,v19,v19
	vxor v20,v20,v20

	lvx v0,0,r4
	lvx v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz .Lv0

	lvx v1,off16,r4
	lvx v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz .Lv1

	lvx v2,off32,r4
	lvx v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz .Lv2

	lvx v3,off48,r4
	lvx v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz .Lv3

	lvx v4,off64,r4
	lvx v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz .Lv4

	lvx v5,off80,r4
	lvx v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz .Lv5

	lvx v6,off96,r4
	lvx v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz .Lv6

	lvx v7,off112,r4
	lvx v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz .Lv7

	addi r3,r3,128
	addi r4,r4,128

	lvx v8,0,r4
	lvx v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz .Lv8

	lvx v9,off16,r4
	lvx v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz .Lv9

	lvx v10,off32,r4
	lvx v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz .Lv10

	lvx v11,off48,r4
	lvx v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz .Lv11

	lvx v12,off64,r4
	lvx v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz .Lv12

	lvx v13,off80,r4
	lvx v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz .Lv13

	lvx v14,off96,r4
	lvx v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz .Lv14

	lvx v15,off112,r4
	lvx v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

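/*
 * Fall-through xor chain: whichever .LvN label the bdz above branched to,
 * every partial product computed so far (vN down to v0) is folded into
 * v19 or v20, alternating between the two so the chains stay independent.
 */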
.Lv15:	vxor v19,v19,v15
.Lv14:	vxor v20,v20,v14
.Lv13:	vxor v19,v19,v13
.Lv12:	vxor v20,v20,v12
.Lv11:	vxor v19,v19,v11
.Lv10:	vxor v20,v20,v10
.Lv9:	vxor v19,v19,v9
.Lv8:	vxor v20,v20,v8
.Lv7:	vxor v19,v19,v7
.Lv6:	vxor v20,v20,v6
.Lv5:	vxor v19,v19,v5
.Lv4:	vxor v20,v20,v4
.Lv3:	vxor v19,v19,v3
.Lv2:	vxor v20,v20,v2
.Lv1:	vxor v19,v19,v1
.Lv0:	vxor v20,v20,v0

	vxor v0,v19,v20

	b .Lbarrett_reduction

.Lzero:
	mr r3,r10
	b .Lout

FUNC_END(CRC_FUNCTION_NAME)