/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/export.h>
#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
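/*
 * For reference, a minimal C model of the computation, assuming len is
 * a multiple of 8 (the tail paths below extend it to arbitrary
 * lengths); the name is illustrative, not a kernel API, and the
 * end-around-carry "if (s < w) s++" models the adde/addze chain:
 *
 *	static unsigned int csum_partial_ref(const unsigned char *p,
 *					     unsigned long len,
 *					     unsigned int sum)
 *	{
 *		unsigned long long s = sum, w;
 *		unsigned long i;
 *
 *		for (i = 0; i < len; i += 8) {
 *			__builtin_memcpy(&w, p + i, 8);
 *			s += w;
 *			if (s < w)
 *				s++;
 *		}
 *		s = (s & 0xffffffffULL) + (s >> 32);
 *		s = (s & 0xffffffffULL) + (s >> 32);
 *		return s;
 *	}
 */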
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
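	/*
	 * Example: a buffer at an address ending in ...0110 gives
	 * r6 = 3 below, so 4 - 3 = 1 halfword load brings r3 up to the
	 * next doubleword boundary.
	 */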
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * The loop below is unrolled so that each iteration sums 64
	 * bytes, and the entry and exit limbs cover a further 64 bytes
	 * between them, so this path needs a minimum length of
	 * 128 bytes.
	 */
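	/*
	 * e.g. len = 192 gives ctr = 192/64 - 1 = 2 loop iterations
	 * (128 bytes); the exit limb sums the remaining 64 bytes and
	 * the andi. afterwards leaves len % 64 for the tail code.
	 */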
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back adde instructions take 2
	 * cycles because of the XER dependency. This means the fastest
	 * this loop can go is 16 cycles per iteration. The scheduling
	 * of the loop below has been shown to hit this on both POWER6
	 * and POWER7.
	 */
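	/*
	 * The loads are therefore interleaved between the adde pairs:
	 * they issue in the shadow of the serialised carry chain and
	 * should not add to the 16-cycle iteration time.
	 */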
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	/* Exit limb: sum the final 64-byte chunk. */
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63		/* remaining length, mod 64 */

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
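	/*
	 * With r0 = H:L, the rotate-and-add below leaves H + L
	 * (including the carry out of the low word) in the upper half,
	 * e.g. r0 = 0x00000002fffffffd folds to 0xffffffff.
	 */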
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)

/*
 * Exception table helpers for the copy variant below. Loads and stores
 * marked with "source"/"dest" run while r14-r16 are saved on the
 * stack, so their fixup (.Lerror) restores them first; "srcnr"/"dstnr"
 * ("no restore") mark accesses outside that region, which branch
 * straight to .Lerror_nr.
 */
.macro srcnr
100:
	EX_TABLE(100b,.Lerror_nr)
.endm

.macro source
150:
	EX_TABLE(150b,.Lerror)
.endm

.macro dstnr
200:
	EX_TABLE(200b,.Lerror_nr)
.endm

.macro dest
250:
	EX_TABLE(250b,.Lerror)
.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
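/*
 * Seeding the sum with 0xffffffff rather than 0 keeps a successful
 * call from ever returning 0, so callers can treat a bare zero result
 * as the fault indicator, e.g. (illustrative only):
 *
 *	csum = csum_partial_copy_generic(src, dst, len);
 *	if (!csum)
 *		return -EFAULT;
 */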
_GLOBAL(csum_partial_copy_generic)
	li	r6,-1
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
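	/*
	 * e.g. src % 8 == 2 and dst % 8 == 4: src is stepped up to the
	 * next doubleword while dst just advances alongside it, so the
	 * stores below may stay unaligned.
	 */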
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * The loop below is unrolled so that each iteration copies and
	 * sums 64 bytes, and the entry and exit limbs cover a further
	 * 64 bytes between them, so this path needs a minimum length
	 * of 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back adde instructions take 2
	 * cycles because of the XER dependency. This means the fastest
	 * this loop can go is 16 cycles per iteration. The scheduling
	 * of the loop below has been shown to hit this on both POWER6
	 * and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	/* Exit limb: copy and sum the final 64-byte chunk. */
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63		/* remaining length, mod 64 */

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lerror:
	/* Fault inside the unrolled loop: restore the saved non-volatiles. */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lerror_nr:
	/* Nothing on the stack to undo: just return 0. */
	li	r3,0
	blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			    const struct in6_addr *daddr,
 *			    __u32 len, __u8 proto, __wsum sum)
 */
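/*
 * For reference, a standalone C model of the RFC 2460 pseudo-header
 * sum computed below (the name is illustrative, not a kernel API).
 * It sums explicit network-order byte pairs, so the byte-order fixup
 * the assembly does with rotldi disappears; the incoming sum is
 * assumed to be a 1's complement partial sum over network-order
 * 16-bit words:
 *
 *	static unsigned short csum_ipv6_ref(const unsigned char saddr[16],
 *					    const unsigned char daddr[16],
 *					    unsigned int len,
 *					    unsigned char proto,
 *					    unsigned int sum)
 *	{
 *		unsigned long long s = sum;
 *		int i;
 *
 *		for (i = 0; i < 16; i += 2) {
 *			s += (saddr[i] << 8) | saddr[i + 1];
 *			s += (daddr[i] << 8) | daddr[i + 1];
 *		}
 *		s += len;
 *		s += proto;
 *		while (s >> 16)
 *			s = (s & 0xffff) + (s >> 16);
 *		return ~s & 0xffff;
 *	}
 */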

_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)		/* saddr, as two 64-bit words */
	ld	r9, 8(r3)
	add	r5, r5, r6		/* len + proto */
	addc	r0, r8, r9
	ld	r10, 0(r4)		/* daddr, as two 64-bit words */
	ld	r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	/*
	 * The loads above byte-reverse the network-order data. Modulo
	 * 2^16 - 1 a rotation by 8 bits swaps the bytes of every 16-bit
	 * word, so this moves len+proto into the same byte-swapped
	 * domain before it is added in.
	 */
	rotldi	r5, r5, 8
#endif
	adde	r0, r0, r10
	add	r5, r5, r7		/* + incoming sum */
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotldi	r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3			/* 1's complement */
	rlwinm	r3, r3, 16, 16, 31	/* extract the 16-bit sum from the high half */
	blr
EXPORT_SYMBOL(csum_ipv6_magic)