/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/export.h>
#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
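/*
 * The block is accumulated into a 64-bit running total using the
 * carry-propagating adde instruction, then folded down to a 32-bit
 * ones' complement sum just before returning.  Roughly (an
 * illustrative C sketch only, not the exact instruction sequence;
 * fold64to32 is a stand-in for the fold at .Lcsum_finish below):
 *
 *	u64 s = sum;
 *	for (each 64-bit word w of buff) {
 *		s += w;
 *		s += carry_out;		// what adde does in hardware
 *	}
 *	return fold64to32(s);
 */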
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3		 	/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6
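	/*
	 * CTR now holds the number of halfwords (1-3) needed to reach an
	 * 8-byte boundary; consuming them is safe because the check above
	 * guarantees len >= 8.
	 */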

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6
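	/* CTR = len/64 - 1: the final 64-byte chunk is summed by the exit limb below */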

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
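	/* (8 adde per 64-byte iteration x 2 cycles = 16 cycles) */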
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
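	/*
	 * A trailing odd byte is padded out to a 16-bit word with a zero
	 * byte.  On big-endian the low-address byte of each 16-bit word is
	 * the high-order byte, so the lone byte must be shifted up; on
	 * little-endian it is the low-order byte and is added as-is.
	 */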
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
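	/*
	 * Fold the 64-bit total to 32 bits: swap the two 32-bit halves,
	 * add, and take the upper word of the result, which is
	 * high + low plus the carry out of the lower word.
	 */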
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)


	.macro srcnr
100:
	EX_TABLE(100b,.Lerror_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lerror)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Lerror_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Lerror)
	.endm
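
/*
 * The source/dest macros tag loads/stores inside the unrolled copy loop,
 * where r14-r16 are saved on the stack; their fault handler (.Lerror)
 * restores those registers and pops the frame.  The srcnr/dstnr ("no
 * restore") variants tag accesses made before the frame is created or
 * after it is torn down, so their handler (.Lerror_nr) just returns 0.
 */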

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
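/*
 * The sum is seeded with 0xffffffff so that 0 can serve as the fault
 * indicator: on an access exception the handlers at .Lerror/.Lerror_nr
 * below return 0 instead of a checksum.
 */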
_GLOBAL(csum_partial_copy_generic)
	li	r6,-1
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lerror:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lerror_nr:
	li	r3,0
	blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */
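/*
 * IPv6 pseudo-header checksum: both 128-bit addresses plus
 * (len + proto + sum) are accumulated with carries, the 64-bit total
 * is folded to 32 and then to 16 bits, complemented, and the 16-bit
 * result is returned in the low half of r3.
 */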

_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)
	ld	r9, 8(r3)
	add	r5, r5, r6
	addc	r0, r8, r9
	ld	r10, 0(r4)
	ld	r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	rotldi	r5, r5, 8
#endif
	adde	r0, r0, r10
	add	r5, r5, r7
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotldi	r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31
	blr
EXPORT_SYMBOL(csum_ipv6_magic)
