1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * This file contains assembly-language implementations |
4 | * of IP-style 1's complement checksum routines. |
5 | * |
6 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) |
7 | * |
8 | * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). |
9 | */ |
10 | |
11 | #include <linux/export.h> |
12 | #include <linux/sys.h> |
13 | #include <asm/processor.h> |
14 | #include <asm/cache.h> |
15 | #include <asm/errno.h> |
16 | #include <asm/ppc_asm.h> |
17 | |
18 | .text |
19 | |
20 | /* |
21 | * computes the checksum of a memory block at buff, length len, |
22 | * and adds in "sum" (32-bit) |
23 | * |
24 | * __csum_partial(buff, len, sum) |
25 | */ |
_GLOBAL(__csum_partial)
	/*
	 * In:  r3 = buff, r4 = len (bytes), r5 = incoming 32-bit sum
	 * Out: r3 = updated 32-bit one's-complement partial sum
	 * Uses r0, r6-r8, ctr; the carry bit (CA) threads the whole
	 * adde chain, so nothing between the first carry-clearing op
	 * and the final addze may disturb CA.
	 * r3 is pre-biased by -4 so every access uses a 4(r3) offset,
	 * letting lwzu/lhz share one addressing pattern.
	 */
	subi	r3,r3,4			/* bias pointer for 4(r3)/lwzu addressing */
	srawi.	r6,r4,2			/* Divide len by 4 and also clear carry */
	beq	3f			/* if we're doing < 4 bytes */
	andi.	r0,r3,2			/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)		/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2			/* # words to do */
	adde	r5,r5,r0
	beq	3f
	/* Peel 0-3 single words so the main loop can go 4 words at a time. */
1:	andi.	r6,r6,3			/* Prepare to handle words 4 by 4 */
	beq	21f
	mtctr	r6
2:	lwzu	r0,4(r3)
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4			/* # blocks of 4 words to do */
	beq	3f
	/*
	 * 4-word unrolled loop, software-pipelined: each load is issued
	 * ahead of the adde that consumes it to hide load latency.
	 */
	lwz	r0,4(r3)
	mtctr	r6
	lwz	r6,8(r3)
	adde	r5,r5,r0
	lwz	r7,12(r3)
	adde	r5,r5,r6
	lwzu	r8,16(r3)
	adde	r5,r5,r7
	bdz	23f
22:	lwz	r0,4(r3)
	adde	r5,r5,r8		/* r8 still holds previous iteration's word */
	lwz	r6,8(r3)
	adde	r5,r5,r0
	lwz	r7,12(r3)
	adde	r5,r5,r6
	lwzu	r8,16(r3)
	adde	r5,r5,r7
	bdnz	22b
23:	adde	r5,r5,r8		/* drain the pipelined last word */
3:	andi.	r0,r4,2			/* trailing halfword? */
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1			/* trailing byte? */
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8			/* Upper byte of word */
	adde	r5,r5,r0
5:	addze	r3,r5			/* add in final carry */
	blr
EXPORT_SYMBOL(__csum_partial)
78 | |
79 | /* |
80 | * Computes the checksum of a memory block at src, length len, |
81 | * and adds in 0xffffffff, while copying the block to dst. |
82 | * If an access exception occurs it returns zero. |
83 | * |
84 | * csum_partial_copy_generic(src, dst, len) |
85 | */ |
/*
 * Copy-and-checksum 16 bytes (4 words) with fault labels.
 * r4 = src-4, r6 = dst-4 (both advanced by the lwzu/stwu); r12
 * accumulates the checksum through the adde carry chain.
 * Each load/store gets a numeric label 8<n><i> so that
 * CSUM_COPY_16_BYTES_EXCODE(n) below can emit the matching
 * exception-table entries; n must be unique per expansion.
 */
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
	adde	r12,r12,r7;	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
	adde	r12,r12,r8;	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
	adde	r12,r12,r9;	\
8 ## n ## 7:			\
	stwu	r10,16(r6);	\
	adde	r12,r12,r10
107 | |
/*
 * Exception-table entries for one CSUM_COPY_16_BYTES_WITHEX(n)
 * expansion: any faulting load or store in it branches to "fault",
 * which makes csum_partial_copy_generic return 0.
 */
#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	EX_TABLE(8 ## n ## 0b, fault);		\
	EX_TABLE(8 ## n ## 1b, fault);		\
	EX_TABLE(8 ## n ## 2b, fault);		\
	EX_TABLE(8 ## n ## 3b, fault);		\
	EX_TABLE(8 ## n ## 4b, fault);		\
	EX_TABLE(8 ## n ## 5b, fault);		\
	EX_TABLE(8 ## n ## 6b, fault);		\
	EX_TABLE(8 ## n ## 7b, fault);
117 | |
	.text

/* Cache geometry, used to size the dcbt/dcbz cacheline loop below. */
CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(csum_partial_copy_generic)
	/*
	 * In:  r3 = src, r4 = dst, r5 = len
	 * Out: r3 = checksum of the copied block (sum seeded with
	 *      0xffffffff via r12 = -1), or 0 if an access fault
	 *      occurred on either src or dst.
	 * r12 accumulates the sum through the adde carry chain;
	 * cr7.eq records "destination address is even".
	 * Both pointers are biased by -4 so lwzu/stwu can load/store
	 * and advance in one instruction.
	 */
	li	r12,-1
	addic	r0,r0,0			/* clear carry */
	addi	r6,r4,-4		/* r6 = dst - 4 for stwu addressing */
	neg	r0,r4
	addi	r4,r3,-4		/* r4 = src - 4 for lwzu addressing */
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	crset	4*cr7+eq
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	rlwinm	r7,r6,3,0x8		/* r7 = 8 if dst is odd, else 0 */
	rlwnm	r12,r12,r7,0,31		/* odd destination address: rotate one byte */
	cmplwi	cr7,r7,0		/* is destination address even ? */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8			/* pack the bytes big-endian into r3 */
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5		/* r5 = bytes left after alignment copy */
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES /* r5 = leftover bytes < 1 line */
	li	r11,4			/* dcbz offset (dst is biased by -4) */
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4			/* warm the first r7 source lines */
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0		/* ctr = lines left minus prefetch depth */
	mr	r0,r7			/* r0 = prefetched lines still to drain */
	mtctr	r8

53:	dcbt	r3,r4			/* keep prefetching a line ahead of the copy */
54:	dcbz	r11,r6			/* zero dst line before overwriting it all */
	/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0			/* any prefetched-but-uncopied lines left? */
	li	r3,4
	li	r7,0			/* second pass: no further prefetch */
	bne	114b

63:	srwi.	r0,r5,2			/* remaining complete words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2			/* trailing halfword? */
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1			/* trailing byte? */
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8			/* byte goes in the upper half of the halfword */
	adde	r12,r12,r0
66:	addze	r3,r12			/* fold in the final carry */
	beqlr+	cr7			/* even destination: done */
	rlwinm	r3,r3,8,0,31		/* odd destination address: rotate one byte */
	blr

/* Common fault handler: any faulting load or store returns 0. */
fault:
	li	r3,0
	blr

	EX_TABLE(70b, fault);
	EX_TABLE(71b, fault);
	EX_TABLE(72b, fault);
	EX_TABLE(73b, fault);
	EX_TABLE(54b, fault);

/*
 * Exception-table entries for the cacheline loop above; every
 * load and store in it branches to "fault" on an access fault.
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	EX_TABLE(30b, fault);
	EX_TABLE(31b, fault);
	EX_TABLE(40b, fault);
	EX_TABLE(41b, fault);
	EX_TABLE(50b, fault);
	EX_TABLE(51b, fault);

EXPORT_SYMBOL(csum_partial_copy_generic)
277 | |
278 | /* |
279 | * __sum16 csum_ipv6_magic(const struct in6_addr *saddr, |
280 | * const struct in6_addr *daddr, |
281 | * __u32 len, __u8 proto, __wsum sum) |
282 | */ |
283 | |
_GLOBAL(csum_ipv6_magic)
	/*
	 * In:  r3 = saddr, r4 = daddr, r5 = len, r6 = proto, r7 = sum
	 * Out: r3 = folded, complemented 16-bit checksum
	 * Sums the 8 words of the two IPv6 addresses plus len+proto+sum
	 * through one carry chain, then folds 32 -> 16 bits and inverts.
	 * Loads are interleaved with the adds to hide load latency.
	 */
	lwz	r8, 0(r3)		/* saddr words */
	lwz	r9, 4(r3)
	addc	r0, r7, r8		/* addc starts the carry chain with sum */
	lwz	r10, 8(r3)
	adde	r0, r0, r9
	lwz	r11, 12(r3)
	adde	r0, r0, r10
	lwz	r8, 0(r4)		/* daddr words */
	adde	r0, r0, r11
	lwz	r9, 4(r4)
	adde	r0, r0, r8
	lwz	r10, 8(r4)
	adde	r0, r0, r9
	lwz	r11, 12(r4)
	adde	r0, r0, r10
	add	r5, r5, r6		/* assumption: len + proto doesn't carry */
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0			/* fold in the final carry */
	rotlwi	r3, r0, 16
	add	r3, r0, r3		/* 32->16 fold: top half = hi + lo halves */
	not	r3, r3			/* one's complement */
	rlwinm	r3, r3, 16, 16, 31	/* extract the folded upper halfword */
	blr
EXPORT_SYMBOL(csum_ipv6_magic)
310 | |