/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/export.h>
#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
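/*
 * For reference, a minimal C model of the computation, assuming len is
 * a multiple of 8 (the tail paths below extend it to arbitrary
 * lengths); the name is illustrative, not a kernel API, and the
 * end-around-carry "if (s < w) s++" models the adde/addze chain:
 *
 *	static unsigned int csum_partial_ref(const unsigned char *p,
 *					     unsigned long len,
 *					     unsigned int sum)
 *	{
 *		unsigned long long s = sum, w;
 *		unsigned long i;
 *
 *		for (i = 0; i < len; i += 8) {
 *			__builtin_memcpy(&w, p + i, 8);
 *			s += w;
 *			if (s < w)
 *				s++;
 *		}
 *		s = (s & 0xffffffffULL) + (s >> 32);
 *		s = (s & 0xffffffffULL) + (s >> 32);
 *		return s;
 *	}
 */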
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
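	/*
	 * Example: a buffer at an address ending in ...0110 gives
	 * r6 = 3 below, so 4 - 3 = 1 halfword load brings r3 up to the
	 * next doubleword boundary.
	 */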
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * The loop below is unrolled so that each iteration sums 64
	 * bytes, and the entry and exit limbs cover a further 64 bytes
	 * between them, so this path needs a minimum length of
	 * 128 bytes.
	 */
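	/*
	 * e.g. len = 192 gives ctr = 192/64 - 1 = 2 loop iterations
	 * (128 bytes); the exit limb sums the remaining 64 bytes and
	 * the andi. afterwards leaves len % 64 for the tail code.
	 */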
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back adde instructions take 2
	 * cycles because of the XER dependency. This means the fastest
	 * this loop can go is 16 cycles per iteration. The scheduling
	 * of the loop below has been shown to hit this on both POWER6
	 * and POWER7.
	 */
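	/*
	 * The loads are therefore interleaved between the adde pairs:
	 * they issue in the shadow of the serialised carry chain and
	 * should not add to the 16-cycle iteration time.
	 */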
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	/* Exit limb: sum the final 64-byte chunk. */
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63		/* remaining length, mod 64 */

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
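	/*
	 * With r0 = H:L, the rotate-and-add below leaves H + L
	 * (including the carry out of the low word) in the upper half,
	 * e.g. r0 = 0x00000002fffffffd folds to 0xffffffff.
	 */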
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)

/*
 * Exception table helpers for the copy variant below. Loads and stores
 * marked with "source"/"dest" run while r14-r16 are saved on the
 * stack, so their fixup (.Lerror) restores them first; "srcnr"/"dstnr"
 * ("no restore") mark accesses outside that region, which branch
 * straight to .Lerror_nr.
 */
.macro srcnr
100:
	EX_TABLE(100b,.Lerror_nr)
.endm

.macro source
150:
	EX_TABLE(150b,.Lerror)
.endm

.macro dstnr
200:
	EX_TABLE(200b,.Lerror_nr)
.endm

.macro dest
250:
	EX_TABLE(250b,.Lerror)
.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
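/*
 * Seeding the sum with 0xffffffff rather than 0 keeps a successful
 * call from ever returning 0, so callers can treat a bare zero result
 * as the fault indicator, e.g. (illustrative only):
 *
 *	csum = csum_partial_copy_generic(src, dst, len);
 *	if (!csum)
 *		return -EFAULT;
 */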
_GLOBAL(csum_partial_copy_generic)
	li	r6,-1
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
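	/*
	 * e.g. src % 8 == 2 and dst % 8 == 4: src is stepped up to the
	 * next doubleword while dst just advances alongside it, so the
	 * stores below may stay unaligned.
	 */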
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * The loop below is unrolled so that each iteration copies and
	 * sums 64 bytes, and the entry and exit limbs cover a further
	 * 64 bytes between them, so this path needs a minimum length
	 * of 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back adde instructions take 2
	 * cycles because of the XER dependency. This means the fastest
	 * this loop can go is 16 cycles per iteration. The scheduling
	 * of the loop below has been shown to hit this on both POWER6
	 * and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	/* Exit limb: copy and sum the final 64-byte chunk. */
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63		/* remaining length, mod 64 */

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lerror:
	/* Fault inside the unrolled loop: restore the saved non-volatiles. */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lerror_nr:
	/* Nothing on the stack to undo: just return 0. */
	li	r3,0
	blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			    const struct in6_addr *daddr,
 *			    __u32 len, __u8 proto, __wsum sum)
 */
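/*
 * For reference, a standalone C model of the RFC 2460 pseudo-header
 * sum computed below (the name is illustrative, not a kernel API).
 * It sums explicit network-order byte pairs, so the byte-order fixup
 * the assembly does with rotldi disappears; the incoming sum is
 * assumed to be a 1's complement partial sum over network-order
 * 16-bit words:
 *
 *	static unsigned short csum_ipv6_ref(const unsigned char saddr[16],
 *					    const unsigned char daddr[16],
 *					    unsigned int len,
 *					    unsigned char proto,
 *					    unsigned int sum)
 *	{
 *		unsigned long long s = sum;
 *		int i;
 *
 *		for (i = 0; i < 16; i += 2) {
 *			s += (saddr[i] << 8) | saddr[i + 1];
 *			s += (daddr[i] << 8) | daddr[i + 1];
 *		}
 *		s += len;
 *		s += proto;
 *		while (s >> 16)
 *			s = (s & 0xffff) + (s >> 16);
 *		return ~s & 0xffff;
 *	}
 */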

_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)		/* saddr, as two 64-bit words */
	ld	r9, 8(r3)
	add	r5, r5, r6		/* len + proto */
	addc	r0, r8, r9
	ld	r10, 0(r4)		/* daddr, as two 64-bit words */
	ld	r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	/*
	 * The loads above byte-reverse the network-order data. Modulo
	 * 2^16 - 1 a rotation by 8 bits swaps the bytes of every 16-bit
	 * word, so this moves len+proto into the same byte-swapped
	 * domain before it is added in.
	 */
	rotldi	r5, r5, 8
#endif
	adde	r0, r0, r10
	add	r5, r5, r7		/* + incoming sum */
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotldi	r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3			/* 1's complement */
	rlwinm	r3, r3, 16, 16, 31	/* extract the 16-bit sum from the high half */
	blr
EXPORT_SYMBOL(csum_ipv6_magic)