1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
2 | /* |
3 | * linux/arch/arm/lib/csumpartialcopygeneric.S |
4 | * |
5 | * Copyright (C) 1995-2001 Russell King |
6 | */ |
7 | #include <asm/assembler.h> |
8 | |
9 | /* |
10 | * unsigned int |
 * csum_partial_copy_xxx(const char *src, char *dst, int len, int sum)
12 | * r0 = src, r1 = dst, r2 = len, r3 = sum |
13 | * Returns : r0 = checksum |
14 | * |
15 | * Note that 'tst' and 'teq' preserve the carry flag. |
16 | */ |
17 | |
/*
 * Register aliases: the argument registers double as the routine's
 * working state throughout this file.
 */
src     .req    r0
dst     .req    r1
len     .req    r2
sum     .req    r3

/*
 * Tail for a zero byte count: return the accumulated checksum.
 * load_regs is the epilogue macro supplied by the including file
 * (restores saved registers and returns to the caller).
 */
.Lzero:         mov     r0, sum
                load_regs
25 | |
/*
 * Align an unaligned destination pointer.  We know that
 * we have >= 8 bytes here, so we don't need to check
 * the length.  Note that the source pointer hasn't been
 * aligned yet.  Entered with C = 0 (set by the caller before
 * blne); returns via lr with dst 32-bit aligned and C holding
 * the carry out of the adcs updates performed here.
 */
.Ldst_unaligned:
                tst     dst, #1
                beq     .Ldst_16bit             @ already 16-bit aligned: skip byte copy

                load1b  ip                      @ one byte from src (loader macro from includer)
                sub     len, len, #1
                adcs    sum, sum, ip, put_byte_1 @ update checksum
                strb    ip, [dst], #1
                tst     dst, #2                 @ tst preserves C
                reteq   lr                      @ dst is now 32bit aligned

.Ldst_16bit:    load2b  r8, ip                  @ two bytes from src
                sub     len, len, #2
                adcs    sum, sum, r8, put_byte_0 @ accumulate each byte in its lane
                strb    r8, [dst], #1
                adcs    sum, sum, ip, put_byte_1
                strb    ip, [dst], #1
                ret     lr                      @ dst is now 32bit aligned
50 | |
/*
 * Handle 0 to 7 bytes, with any alignment of source and
 * destination pointers.  Note that when we get here, C = 0.
 */
.Lless8:        teq     len, #0                 @ check for zero count
                beq     .Lzero

/* we must have at least one byte. */
                tst     dst, #1                 @ dst 16-bit aligned?
                beq     .Lless8_aligned

                /* Align dst: copy and checksum a single byte. */
                load1b  ip
                sub     len, len, #1
                adcs    sum, sum, ip, put_byte_1 @ update checksum
                strb    ip, [dst], #1
                tst     len, #6                 @ any byte pairs left? (len <= 6 here)
                beq     .Lless8_byteonly

                /* Copy pairs of bytes, updating the checksum. */
1:              load2b  r8, ip
                sub     len, len, #2
                adcs    sum, sum, r8, put_byte_0
                strb    r8, [dst], #1
                adcs    sum, sum, ip, put_byte_1
                strb    ip, [dst], #1
.Lless8_aligned:
                tst     len, #6                 @ tst preserves C across the loop
                bne     1b
.Lless8_byteonly:
                tst     len, #1                 @ odd trailing byte?
                beq     .Ldone
                load1b  r8
                adcs    sum, sum, r8, put_byte_0 @ update checksum
                strb    r8, [dst], #1
                b       .Ldone
86 | |
/*
 * Function entry.  FN_ENTRY and save_regs are macros supplied by the
 * file that includes this template; .Ldone below reloads dst from
 * [sp, #0], so save_regs is expected to stack dst in slot 0 (along
 * with r4-r8, which are used as scratch throughout).
 */
FN_ENTRY
                save_regs
                mov     sum, #-1                @ seed the checksum accumulator

                cmp     len, #8                 @ Ensure that we have at least
                blo     .Lless8                 @ 8 bytes to copy.

                adds    sum, sum, #0            @ C = 0
                tst     dst, #3                 @ Test destination alignment
                blne    .Ldst_unaligned         @ align destination, return here

/*
 * Ok, the dst pointer is now 32bit aligned, and we know
 * that we must have more than 4 bytes to copy.  Note
 * that C contains the carry from the dst alignment above.
 */

                tst     src, #3                 @ Test source alignment
                bne     .Lsrc_not_aligned

/* Routine for src & dst aligned */

                bics    ip, len, #15            @ ip = len rounded down to 16-byte chunks
                beq     2f

1:              load4l  r4, r5, r6, r7          @ copy and checksum 16 bytes per pass
                stmia   dst!, {r4, r5, r6, r7}
                adcs    sum, sum, r4
                adcs    sum, sum, r5
                adcs    sum, sum, r6
                adcs    sum, sum, r7
                sub     ip, ip, #16
                teq     ip, #0                  @ teq preserves the carry chain
                bne     1b

2:              ands    ip, len, #12            @ 4, 8 or 12 bytes left in whole words?
                beq     4f
                tst     ip, #8
                beq     3f
                load2l  r4, r5                  @ copy and checksum two words
                stmia   dst!, {r4, r5}
                adcs    sum, sum, r4
                adcs    sum, sum, r5
                tst     ip, #4
                beq     4f

3:              load1l  r4                      @ copy and checksum one word
                str     r4, [dst], #4
                adcs    sum, sum, r4

4:              ands    len, len, #3            @ 0-3 trailing bytes
                beq     .Ldone
                load1l  r4                      @ whole word; only len bytes are consumed
                tst     len, #2
                mov     r5, r4, get_byte_0
                beq     .Lexit
                adcs    sum, sum, r4, lspush #16 @ fold the two live bytes into sum
                strb    r5, [dst], #1
                mov     r5, r4, get_byte_1
                strb    r5, [dst], #1
                mov     r5, r4, get_byte_2
.Lexit:         tst     len, #1                 @ one final byte?
                strbne  r5, [dst], #1
                andne   r5, r5, #255
                adcsne  sum, sum, r5, put_byte_0

/*
 * If the dst pointer was not 16-bit aligned, we
 * need to rotate the checksum here to get around
 * the inefficient byte manipulations in the
 * architecture independent code.
 */
.Ldone:         adc     r0, sum, #0             @ fold in the final carry
                ldr     sum, [sp, #0]           @ dst (stacked by save_regs)
                tst     sum, #1
                movne   r0, r0, ror #8          @ odd dst: byte-rotate the result
                load_regs                       @ restore registers and return
164 | |
/*
 * Source is not 32-bit aligned.  Word-align src, remembering the
 * byte offset in ip, then dispatch to the handler for that offset.
 * Each handler reads aligned words and splices the bytes back
 * together with lspull/lspush (endianness-aware shift macros;
 * see <asm/assembler.h>).
 */
.Lsrc_not_aligned:
                adc     sum, sum, #0            @ include C from dst alignment
                and     ip, src, #3             @ src offset within its word: 1, 2 or 3
                bic     src, src, #3            @ word-align the source pointer
                load1l  r5                      @ first (partial) source word
                cmp     ip, #2
                beq     .Lsrc2_aligned
                bhi     .Lsrc3_aligned
                /* src & 3 == 1: three useful bytes in each loaded word. */
                mov     r4, r5, lspull #8       @ C = 0
                bics    ip, len, #15
                beq     2f
1:              load4l  r5, r6, r7, r8          @ 16 source bytes per pass
                orr     r4, r4, r5, lspush #24  @ splice 3 carried + 1 new byte
                mov     r5, r5, lspull #8
                orr     r5, r5, r6, lspush #24
                mov     r6, r6, lspull #8
                orr     r6, r6, r7, lspush #24
                mov     r7, r7, lspull #8
                orr     r7, r7, r8, lspush #24
                stmia   dst!, {r4, r5, r6, r7}
                adcs    sum, sum, r4
                adcs    sum, sum, r5
                adcs    sum, sum, r6
                adcs    sum, sum, r7
                mov     r4, r8, lspull #8       @ carry leftover bytes to next pass
                sub     ip, ip, #16
                teq     ip, #0                  @ preserves the carry chain
                bne     1b
2:              ands    ip, len, #12            @ whole words still to go?
                beq     4f
                tst     ip, #8
                beq     3f
                load2l  r5, r6                  @ two more words
                orr     r4, r4, r5, lspush #24
                mov     r5, r5, lspull #8
                orr     r5, r5, r6, lspush #24
                stmia   dst!, {r4, r5}
                adcs    sum, sum, r4
                adcs    sum, sum, r5
                mov     r4, r6, lspull #8
                tst     ip, #4
                beq     4f
3:              load1l  r5                      @ one more word
                orr     r4, r4, r5, lspush #24
                str     r4, [dst], #4
                adcs    sum, sum, r4
                mov     r4, r5, lspull #8
4:              ands    len, len, #3            @ 0-3 trailing bytes (already in r4)
                beq     .Ldone
                mov     r5, r4, get_byte_0
                tst     len, #2
                beq     .Lexit
                adcs    sum, sum, r4, lspush #16
                strb    r5, [dst], #1
                mov     r5, r4, get_byte_1
                strb    r5, [dst], #1
                mov     r5, r4, get_byte_2
                b       .Lexit                  @ shared tail handles the odd byte
223 | |
/* src & 3 == 2: two useful bytes in each loaded word. */
.Lsrc2_aligned: mov     r4, r5, lspull #16      @ keep the two valid bytes
                adds    sum, sum, #0            @ C = 0
                bics    ip, len, #15
                beq     2f
1:              load4l  r5, r6, r7, r8          @ 16 source bytes per pass
                orr     r4, r4, r5, lspush #16  @ splice 2 carried + 2 new bytes
                mov     r5, r5, lspull #16
                orr     r5, r5, r6, lspush #16
                mov     r6, r6, lspull #16
                orr     r6, r6, r7, lspush #16
                mov     r7, r7, lspull #16
                orr     r7, r7, r8, lspush #16
                stmia   dst!, {r4, r5, r6, r7}
                adcs    sum, sum, r4
                adcs    sum, sum, r5
                adcs    sum, sum, r6
                adcs    sum, sum, r7
                mov     r4, r8, lspull #16      @ carry leftover bytes to next pass
                sub     ip, ip, #16
                teq     ip, #0                  @ preserves the carry chain
                bne     1b
2:              ands    ip, len, #12            @ whole words still to go?
                beq     4f
                tst     ip, #8
                beq     3f
                load2l  r5, r6
                orr     r4, r4, r5, lspush #16
                mov     r5, r5, lspull #16
                orr     r5, r5, r6, lspush #16
                stmia   dst!, {r4, r5}
                adcs    sum, sum, r4
                adcs    sum, sum, r5
                mov     r4, r6, lspull #16
                tst     ip, #4
                beq     4f
3:              load1l  r5
                orr     r4, r4, r5, lspush #16
                str     r4, [dst], #4
                adcs    sum, sum, r4
                mov     r4, r5, lspull #16
4:              ands    len, len, #3            @ 0-3 trailing bytes
                beq     .Ldone
                mov     r5, r4, get_byte_0
                tst     len, #2
                beq     .Lexit
                adcs    sum, sum, r4            @ r4 holds exactly the two live bytes
                strb    r5, [dst], #1
                mov     r5, r4, get_byte_1
                strb    r5, [dst], #1
                tst     len, #1                 @ a final odd byte must be fetched:
                beq     .Ldone                  @ only 2 bytes were carried in r4
                load1b  r5
                b       .Lexit                  @ shared tail stores/checksums r5
277 | |
/* src & 3 == 3: one useful byte in each loaded word. */
.Lsrc3_aligned: mov     r4, r5, lspull #24      @ keep the single valid byte
                adds    sum, sum, #0            @ C = 0
                bics    ip, len, #15
                beq     2f
1:              load4l  r5, r6, r7, r8          @ 16 source bytes per pass
                orr     r4, r4, r5, lspush #8   @ splice 1 carried + 3 new bytes
                mov     r5, r5, lspull #24
                orr     r5, r5, r6, lspush #8
                mov     r6, r6, lspull #24
                orr     r6, r6, r7, lspush #8
                mov     r7, r7, lspull #24
                orr     r7, r7, r8, lspush #8
                stmia   dst!, {r4, r5, r6, r7}
                adcs    sum, sum, r4
                adcs    sum, sum, r5
                adcs    sum, sum, r6
                adcs    sum, sum, r7
                mov     r4, r8, lspull #24      @ carry leftover byte to next pass
                sub     ip, ip, #16
                teq     ip, #0                  @ preserves the carry chain
                bne     1b
2:              ands    ip, len, #12            @ whole words still to go?
                beq     4f
                tst     ip, #8
                beq     3f
                load2l  r5, r6
                orr     r4, r4, r5, lspush #8
                mov     r5, r5, lspull #24
                orr     r5, r5, r6, lspush #8
                stmia   dst!, {r4, r5}
                adcs    sum, sum, r4
                adcs    sum, sum, r5
                mov     r4, r6, lspull #24
                tst     ip, #4
                beq     4f
3:              load1l  r5
                orr     r4, r4, r5, lspush #8
                str     r4, [dst], #4
                adcs    sum, sum, r4
                mov     r4, r5, lspull #24
4:              ands    len, len, #3            @ 0-3 trailing bytes
                beq     .Ldone
                mov     r5, r4, get_byte_0      @ the one byte carried in r4
                tst     len, #2
                beq     .Lexit
                strb    r5, [dst], #1           @ store it, then fetch a fresh word
                adcs    sum, sum, r4
                load1l  r4
                mov     r5, r4, get_byte_0
                strb    r5, [dst], #1
                adcs    sum, sum, r4, lspush #24
                mov     r5, r4, get_byte_1      @ byte for the shared odd-byte tail
                b       .Lexit
FN_EXIT
332 | |