1 | /* |
2 | * This file is subject to the terms and conditions of the GNU General Public |
3 | * License. See the file "COPYING" in the main directory of this archive |
4 | * for more details. |
5 | * |
6 | * Quick'n'dirty IP checksum ... |
7 | * |
8 | * Copyright (C) 1998, 1999 Ralf Baechle |
9 | * Copyright (C) 1999 Silicon Graphics, Inc. |
10 | * Copyright (C) 2007 Maciej W. Rozycki |
11 | * Copyright (C) 2014 Imagination Technologies Ltd. |
12 | */ |
13 | #include <linux/errno.h> |
14 | #include <linux/export.h> |
15 | #include <asm/asm.h> |
16 | #include <asm/asm-offsets.h> |
17 | #include <asm/regdef.h> |
18 | |
19 | #ifdef CONFIG_64BIT |
20 | /* |
 * As we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the registers from the n64
 * ABI naming to the o32 ABI naming.
24 | */ |
25 | #undef t0 |
26 | #undef t1 |
27 | #undef t2 |
28 | #undef t3 |
29 | #define t0 $8 |
30 | #define t1 $9 |
31 | #define t2 $10 |
32 | #define t3 $11 |
33 | #define t4 $12 |
34 | #define t5 $13 |
35 | #define t6 $14 |
36 | #define t7 $15 |
37 | |
38 | #define USE_DOUBLE |
39 | #endif |
40 | |
41 | #ifdef USE_DOUBLE |
42 | |
43 | #define LOAD ld |
44 | #define LOAD32 lwu |
45 | #define ADD daddu |
46 | #define NBYTES 8 |
47 | |
48 | #else |
49 | |
50 | #define LOAD lw |
51 | #define LOAD32 lw |
52 | #define ADD addu |
53 | #define NBYTES 4 |
54 | |
55 | #endif /* USE_DOUBLE */ |
56 | |
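/*
 * With USE_DOUBLE the checksum is accumulated with 64-bit arithmetic,
 * eight bytes per load, and only folded down to 32 bits at the very end.
 */
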
57 | #define UNIT(unit) ((unit)*NBYTES) |
58 | |
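/*
 * ADDC adds 'reg' into the running checksum with an end-around carry:
 * any carry out of the addition (detected with sltu) is folded back
 * into the sum, as ones'-complement arithmetic requires.  ADDC32 is
 * the 32-bit-only variant, used to merge in the caller's partial sum.
 */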
59 | #define ADDC(sum,reg) \ |
60 | .set push; \ |
61 | .set noat; \ |
62 | ADD sum, reg; \ |
63 | sltu v1, sum, reg; \ |
64 | ADD sum, v1; \ |
65 | .set pop |
66 | |
67 | #define ADDC32(sum,reg) \ |
68 | .set push; \ |
69 | .set noat; \ |
70 | addu sum, reg; \ |
71 | sltu v1, sum, reg; \ |
72 | addu sum, v1; \ |
73 | .set pop |
74 | |
75 | #define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3) \ |
76 | LOAD _t0, (offset + UNIT(0))(src); \ |
77 | LOAD _t1, (offset + UNIT(1))(src); \ |
78 | LOAD _t2, (offset + UNIT(2))(src); \ |
79 | LOAD _t3, (offset + UNIT(3))(src); \ |
80 | ADDC(_t0, _t1); \ |
81 | ADDC(_t2, _t3); \ |
82 | ADDC(sum, _t0); \ |
83 | ADDC(sum, _t2) |
84 | |
85 | #ifdef USE_DOUBLE |
86 | #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \ |
87 | CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3) |
88 | #else |
89 | #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \ |
90 | CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3); \ |
91 | CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3) |
92 | #endif |
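
/*
 * Each CSUM_BIGCHUNK invocation folds 32 bytes into the running sum:
 * four doubleword loads on 64-bit kernels, eight word loads on 32-bit.
 */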
93 | |
94 | /* |
95 | * a0: source address |
96 | * a1: length of the area to checksum |
97 | * a2: partial checksum |
98 | */ |
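
/*
 * A minimal, hypothetical C-level view of a call (the result comes back
 * in v0; 'buf', 'len' and 'prev' are illustrative names only):
 *
 *	__wsum partial = csum_partial(buf, len, prev);
 *	__sum16 folded = csum_fold(partial);
 */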
99 | |
100 | #define src a0 |
101 | #define sum v0 |
102 | |
103 | .text |
104 | .set noreorder |
105 | .align 5 |
106 | LEAF(csum_partial) |
107 | EXPORT_SYMBOL(csum_partial) |
108 | move sum, zero |
109 | move t7, zero |
110 | |
111 | sltiu t8, a1, 0x8 |
112 | bnez t8, .Lsmall_csumcpy /* < 8 bytes to copy */ |
113 | move t2, a1 |
114 | |
115 | andi t7, src, 0x1 /* odd buffer? */ |
116 | |
117 | .Lhword_align: |
118 | beqz t7, .Lword_align |
119 | andi t8, src, 0x2 |
120 | |
121 | lbu t0, (src) |
122 | LONG_SUBU a1, a1, 0x1 |
123 | #ifdef __MIPSEL__ |
124 | sll t0, t0, 8 |
125 | #endif |
126 | ADDC(sum, t0) |
127 | PTR_ADDU src, src, 0x1 |
128 | andi t8, src, 0x2 |
129 | |
130 | .Lword_align: |
131 | beqz t8, .Ldword_align |
132 | sltiu t8, a1, 56 |
133 | |
134 | lhu t0, (src) |
135 | LONG_SUBU a1, a1, 0x2 |
136 | ADDC(sum, t0) |
137 | sltiu t8, a1, 56 |
138 | PTR_ADDU src, src, 0x2 |
139 | |
140 | .Ldword_align: |
141 | bnez t8, .Ldo_end_words |
142 | move t8, a1 |
143 | |
144 | andi t8, src, 0x4 |
145 | beqz t8, .Lqword_align |
146 | andi t8, src, 0x8 |
147 | |
148 | LOAD32 t0, 0x00(src) |
149 | LONG_SUBU a1, a1, 0x4 |
150 | ADDC(sum, t0) |
151 | PTR_ADDU src, src, 0x4 |
152 | andi t8, src, 0x8 |
153 | |
154 | .Lqword_align: |
155 | beqz t8, .Loword_align |
156 | andi t8, src, 0x10 |
157 | |
158 | #ifdef USE_DOUBLE |
159 | ld t0, 0x00(src) |
160 | LONG_SUBU a1, a1, 0x8 |
161 | ADDC(sum, t0) |
162 | #else |
163 | lw t0, 0x00(src) |
164 | lw t1, 0x04(src) |
165 | LONG_SUBU a1, a1, 0x8 |
166 | ADDC(sum, t0) |
167 | ADDC(sum, t1) |
168 | #endif |
169 | PTR_ADDU src, src, 0x8 |
170 | andi t8, src, 0x10 |
171 | |
172 | .Loword_align: |
173 | beqz t8, .Lbegin_movement |
174 | LONG_SRL t8, a1, 0x7 |
175 | |
176 | #ifdef USE_DOUBLE |
177 | ld t0, 0x00(src) |
178 | ld t1, 0x08(src) |
179 | ADDC(sum, t0) |
180 | ADDC(sum, t1) |
181 | #else |
182 | CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4) |
183 | #endif |
184 | LONG_SUBU a1, a1, 0x10 |
185 | PTR_ADDU src, src, 0x10 |
186 | LONG_SRL t8, a1, 0x7 |
187 | |
188 | .Lbegin_movement: |
189 | beqz t8, 1f |
190 | andi t2, a1, 0x40 |
191 | |
192 | .Lmove_128bytes: |
193 | CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) |
194 | CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4) |
195 | CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4) |
196 | CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4) |
197 | LONG_SUBU t8, t8, 0x01 |
198 | .set reorder /* DADDI_WAR */ |
199 | PTR_ADDU src, src, 0x80 |
200 | bnez t8, .Lmove_128bytes |
201 | .set noreorder |
202 | |
203 | 1: |
204 | beqz t2, 1f |
205 | andi t2, a1, 0x20 |
206 | |
207 | .Lmove_64bytes: |
208 | CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) |
209 | CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4) |
210 | PTR_ADDU src, src, 0x40 |
211 | |
212 | 1: |
213 | beqz t2, .Ldo_end_words |
214 | andi t8, a1, 0x1c |
215 | |
216 | .Lmove_32bytes: |
217 | CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) |
218 | andi t8, a1, 0x1c |
219 | PTR_ADDU src, src, 0x20 |
220 | |
221 | .Ldo_end_words: |
222 | beqz t8, .Lsmall_csumcpy |
223 | andi t2, a1, 0x3 |
224 | LONG_SRL t8, t8, 0x2 |
225 | |
226 | .Lend_words: |
227 | LOAD32 t0, (src) |
228 | LONG_SUBU t8, t8, 0x1 |
229 | ADDC(sum, t0) |
230 | .set reorder /* DADDI_WAR */ |
231 | PTR_ADDU src, src, 0x4 |
232 | bnez t8, .Lend_words |
233 | .set noreorder |
234 | |
235 | /* unknown src alignment and < 8 bytes to go */ |
236 | .Lsmall_csumcpy: |
237 | move a1, t2 |
238 | |
239 | andi t0, a1, 4 |
240 | beqz t0, 1f |
241 | andi t0, a1, 2 |
242 | |
243 | /* Still a full word to go */ |
244 | ulw t1, (src) |
245 | PTR_ADDIU src, 4 |
246 | #ifdef USE_DOUBLE |
247 | dsll t1, t1, 32 /* clear lower 32bit */ |
248 | #endif |
249 | ADDC(sum, t1) |
250 | |
251 | 1: move t1, zero |
252 | beqz t0, 1f |
253 | andi t0, a1, 1 |
254 | |
255 | /* Still a halfword to go */ |
256 | ulhu t1, (src) |
257 | PTR_ADDIU src, 2 |
258 | |
259 | 1: beqz t0, 1f |
260 | sll t1, t1, 16 |
261 | |
262 | lbu t2, (src) |
263 | nop |
264 | |
265 | #ifdef __MIPSEB__ |
266 | sll t2, t2, 8 |
267 | #endif |
268 | or t1, t2 |
269 | |
270 | 1: ADDC(sum, t1) |
271 | |
272 | /* fold checksum */ |
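	/*
	 * On 64-bit kernels the accumulator is 64 bits wide: add its upper
	 * and lower 32-bit halves together, fold the carry back in, and
	 * keep only the low 32 bits.
	 */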
273 | #ifdef USE_DOUBLE |
274 | dsll32 v1, sum, 0 |
275 | daddu sum, v1 |
276 | sltu v1, sum, v1 |
277 | dsra32 sum, sum, 0 |
278 | addu sum, v1 |
279 | #endif |
280 | |
281 | /* odd buffer alignment? */ |
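	/*
	 * If the buffer started on an odd address, every byte was summed
	 * into the opposite half of its 16-bit word, so the bytes of the
	 * result must be swapped within each halfword (wsbh where
	 * available, shift/mask otherwise).
	 */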
282 | #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \ |
283 | defined(CONFIG_CPU_LOONGSON64) |
284 | .set push |
285 | .set arch=mips32r2 |
286 | wsbh v1, sum |
287 | movn sum, v1, t7 |
288 | .set pop |
289 | #else |
290 | beqz t7, 1f /* odd buffer alignment? */ |
291 | lui v1, 0x00ff |
292 | addu v1, 0x00ff |
293 | and t0, sum, v1 |
294 | sll t0, t0, 8 |
295 | srl sum, sum, 8 |
296 | and sum, sum, v1 |
297 | or sum, sum, t0 |
298 | 1: |
299 | #endif |
300 | .set reorder |
301 | /* Add the passed partial csum. */ |
302 | ADDC32(sum, a2) |
303 | jr ra |
304 | .set noreorder |
305 | END(csum_partial) |
306 | |
307 | |
308 | /* |
309 | * checksum and copy routines based on memcpy.S |
310 | * |
 * __csum_partial_copy_nocheck(src, dst, len)
 * __csum_partial_copy_to_user(src, dst, len)
 * __csum_partial_copy_from_user(src, dst, len)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
316 | */ |
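
/*
 * A hypothetical C-level sketch of the common case: each entry point
 * takes (src, dst, len) in a0/a1/a2, copies len bytes from src to dst
 * and returns the partial ones'-complement checksum of the copied data
 * in v0, or 0 if a fault was taken:
 *
 *	__wsum csum = __csum_partial_copy_nocheck(src, dst, len);
 */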
317 | |
318 | #define src a0 |
319 | #define dst a1 |
320 | #define len a2 |
321 | #define sum v0 |
322 | #define odd t8 |
323 | |
324 | /* |
325 | * All exception handlers simply return 0. |
326 | */ |
327 | |
328 | /* Instruction type */ |
329 | #define LD_INSN 1 |
330 | #define ST_INSN 2 |
331 | #define LEGACY_MODE 1 |
332 | #define EVA_MODE 2 |
333 | #define USEROP 1 |
334 | #define KERNELOP 2 |
335 | |
336 | /* |
337 | * Wrapper to add an entry in the exception table |
338 | * in case the insn causes a memory exception. |
339 | * Arguments: |
340 | * insn : Load/store instruction |
341 | * type : Instruction type |
342 | * reg : Register |
343 | * addr : Address |
 *
 * Faulting accesses are all directed to the common .L_exc handler.
 */
346 | #define EXC(insn, type, reg, addr) \ |
347 | .if \mode == LEGACY_MODE; \ |
348 | 9: insn reg, addr; \ |
349 | .section __ex_table,"a"; \ |
350 | PTR_WD 9b, .L_exc; \ |
351 | .previous; \ |
352 | /* This is enabled in EVA mode */ \ |
353 | .else; \ |
354 | /* If loading from user or storing to user */ \ |
355 | .if ((\from == USEROP) && (type == LD_INSN)) || \ |
356 | ((\to == USEROP) && (type == ST_INSN)); \ |
357 | 9: __BUILD_EVA_INSN(insn##e, reg, addr); \ |
358 | .section __ex_table,"a"; \ |
359 | PTR_WD 9b, .L_exc; \ |
360 | .previous; \ |
361 | .else; \ |
362 | /* EVA without exception */ \ |
363 | insn reg, addr; \ |
364 | .endif; \ |
365 | .endif |
366 | |
367 | #undef LOAD |
368 | |
369 | #ifdef USE_DOUBLE |
370 | |
371 | #define LOADK ld /* No exception */ |
372 | #define LOAD(reg, addr) EXC(ld, LD_INSN, reg, addr) |
373 | #define LOADBU(reg, addr) EXC(lbu, LD_INSN, reg, addr) |
374 | #define LOADL(reg, addr) EXC(ldl, LD_INSN, reg, addr) |
375 | #define LOADR(reg, addr) EXC(ldr, LD_INSN, reg, addr) |
376 | #define STOREB(reg, addr) EXC(sb, ST_INSN, reg, addr) |
377 | #define STOREL(reg, addr) EXC(sdl, ST_INSN, reg, addr) |
378 | #define STORER(reg, addr) EXC(sdr, ST_INSN, reg, addr) |
379 | #define STORE(reg, addr) EXC(sd, ST_INSN, reg, addr) |
380 | #define ADD daddu |
381 | #define SUB dsubu |
382 | #define SRL dsrl |
383 | #define SLL dsll |
384 | #define SLLV dsllv |
385 | #define SRLV dsrlv |
386 | #define NBYTES 8 |
387 | #define LOG_NBYTES 3 |
388 | |
389 | #else |
390 | |
391 | #define LOADK lw /* No exception */ |
392 | #define LOAD(reg, addr) EXC(lw, LD_INSN, reg, addr) |
393 | #define LOADBU(reg, addr) EXC(lbu, LD_INSN, reg, addr) |
394 | #define LOADL(reg, addr) EXC(lwl, LD_INSN, reg, addr) |
395 | #define LOADR(reg, addr) EXC(lwr, LD_INSN, reg, addr) |
396 | #define STOREB(reg, addr) EXC(sb, ST_INSN, reg, addr) |
397 | #define STOREL(reg, addr) EXC(swl, ST_INSN, reg, addr) |
398 | #define STORER(reg, addr) EXC(swr, ST_INSN, reg, addr) |
399 | #define STORE(reg, addr) EXC(sw, ST_INSN, reg, addr) |
400 | #define ADD addu |
401 | #define SUB subu |
402 | #define SRL srl |
403 | #define SLL sll |
404 | #define SLLV sllv |
405 | #define SRLV srlv |
406 | #define NBYTES 4 |
407 | #define LOG_NBYTES 2 |
408 | |
409 | #endif /* USE_DOUBLE */ |
410 | |
411 | #ifdef CONFIG_CPU_LITTLE_ENDIAN |
412 | #define LDFIRST LOADR |
413 | #define LDREST LOADL |
414 | #define STFIRST STORER |
415 | #define STREST STOREL |
416 | #define SHIFT_DISCARD SLLV |
417 | #define SHIFT_DISCARD_REVERT SRLV |
418 | #else |
419 | #define LDFIRST LOADL |
420 | #define LDREST LOADR |
421 | #define STFIRST STOREL |
422 | #define STREST STORER |
423 | #define SHIFT_DISCARD SRLV |
424 | #define SHIFT_DISCARD_REVERT SLLV |
425 | #endif |
426 | |
427 | #define FIRST(unit) ((unit)*NBYTES) |
428 | #define REST(unit) (FIRST(unit)+NBYTES-1) |
429 | |
430 | #define ADDRMASK (NBYTES-1) |
431 | |
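/*
 * Without the DADDI workarounds $at is never needed here, so forbid it;
 * with the workarounds the assembler-expanded address arithmetic needs
 * a temporary, so the assembler temporary is redirected to v1.
 */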
432 | #ifndef CONFIG_CPU_DADDI_WORKAROUNDS |
433 | .set noat |
434 | #else |
435 | .set at=v1 |
436 | #endif |
437 | |
438 | .macro __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to |
439 | |
440 | li sum, -1 |
441 | move odd, zero |
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 */
446 | /* |
447 | * The "issue break"s below are very approximate. |
448 | * Issue delays for dcache fills will perturb the schedule, as will |
449 | * load queue full replay traps, etc. |
450 | * |
451 | * If len < NBYTES use byte operations. |
452 | */ |
453 | sltu t2, len, NBYTES |
454 | and t1, dst, ADDRMASK |
455 | bnez t2, .Lcopy_bytes_checklen\@ |
456 | and t0, src, ADDRMASK |
457 | andi odd, dst, 0x1 /* odd buffer? */ |
458 | bnez t1, .Ldst_unaligned\@ |
459 | nop |
460 | bnez t0, .Lsrc_unaligned_dst_aligned\@ |
461 | /* |
462 | * use delay slot for fall-through |
463 | * src and dst are aligned; need to compute rem |
464 | */ |
465 | .Lboth_aligned\@: |
466 | SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter |
467 | beqz t0, .Lcleanup_both_aligned\@ # len < 8*NBYTES |
468 | nop |
469 | SUB len, 8*NBYTES # subtract here for bgez loop |
470 | .align 4 |
471 | 1: |
472 | LOAD(t0, UNIT(0)(src)) |
473 | LOAD(t1, UNIT(1)(src)) |
474 | LOAD(t2, UNIT(2)(src)) |
475 | LOAD(t3, UNIT(3)(src)) |
476 | LOAD(t4, UNIT(4)(src)) |
477 | LOAD(t5, UNIT(5)(src)) |
478 | LOAD(t6, UNIT(6)(src)) |
479 | LOAD(t7, UNIT(7)(src)) |
480 | SUB len, len, 8*NBYTES |
481 | ADD src, src, 8*NBYTES |
482 | STORE(t0, UNIT(0)(dst)) |
483 | ADDC(t0, t1) |
484 | STORE(t1, UNIT(1)(dst)) |
485 | ADDC(sum, t0) |
486 | STORE(t2, UNIT(2)(dst)) |
487 | ADDC(t2, t3) |
488 | STORE(t3, UNIT(3)(dst)) |
489 | ADDC(sum, t2) |
490 | STORE(t4, UNIT(4)(dst)) |
491 | ADDC(t4, t5) |
492 | STORE(t5, UNIT(5)(dst)) |
493 | ADDC(sum, t4) |
494 | STORE(t6, UNIT(6)(dst)) |
495 | ADDC(t6, t7) |
496 | STORE(t7, UNIT(7)(dst)) |
497 | ADDC(sum, t6) |
498 | .set reorder /* DADDI_WAR */ |
499 | ADD dst, dst, 8*NBYTES |
500 | bgez len, 1b |
501 | .set noreorder |
502 | ADD len, 8*NBYTES # revert len (see above) |
503 | |
504 | /* |
505 | * len == the number of bytes left to copy < 8*NBYTES |
506 | */ |
507 | .Lcleanup_both_aligned\@: |
508 | #define rem t7 |
509 | beqz len, .Ldone\@ |
510 | sltu t0, len, 4*NBYTES |
511 | bnez t0, .Lless_than_4units\@ |
512 | and rem, len, (NBYTES-1) # rem = len % NBYTES |
513 | /* |
514 | * len >= 4*NBYTES |
515 | */ |
516 | LOAD(t0, UNIT(0)(src)) |
517 | LOAD(t1, UNIT(1)(src)) |
518 | LOAD(t2, UNIT(2)(src)) |
519 | LOAD(t3, UNIT(3)(src)) |
520 | SUB len, len, 4*NBYTES |
521 | ADD src, src, 4*NBYTES |
522 | STORE(t0, UNIT(0)(dst)) |
523 | ADDC(t0, t1) |
524 | STORE(t1, UNIT(1)(dst)) |
525 | ADDC(sum, t0) |
526 | STORE(t2, UNIT(2)(dst)) |
527 | ADDC(t2, t3) |
528 | STORE(t3, UNIT(3)(dst)) |
529 | ADDC(sum, t2) |
530 | .set reorder /* DADDI_WAR */ |
531 | ADD dst, dst, 4*NBYTES |
532 | beqz len, .Ldone\@ |
533 | .set noreorder |
534 | .Lless_than_4units\@: |
535 | /* |
536 | * rem = len % NBYTES |
537 | */ |
538 | beq rem, len, .Lcopy_bytes\@ |
539 | nop |
540 | 1: |
541 | LOAD(t0, 0(src)) |
542 | ADD src, src, NBYTES |
543 | SUB len, len, NBYTES |
544 | STORE(t0, 0(dst)) |
545 | ADDC(sum, t0) |
546 | .set reorder /* DADDI_WAR */ |
547 | ADD dst, dst, NBYTES |
548 | bne rem, len, 1b |
549 | .set noreorder |
550 | |
551 | /* |
552 | * src and dst are aligned, need to copy rem bytes (rem < NBYTES) |
553 | * A loop would do only a byte at a time with possible branch |
554 | * mispredicts. Can't do an explicit LOAD dst,mask,or,STORE |
555 | * because can't assume read-access to dst. Instead, use |
556 | * STREST dst, which doesn't require read access to dst. |
557 | * |
558 | * This code should perform better than a simple loop on modern, |
559 | * wide-issue mips processors because the code has fewer branches and |
560 | * more instruction-level parallelism. |
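	 *
	 * For example, with NBYTES == 4 and 3 bytes left: rem = 24 (bits to
	 * keep) and bits = 8 (bits to discard); the stray byte is shifted
	 * out before the partial store, and the value is shifted back
	 * before it is folded into the checksum.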
561 | */ |
562 | #define bits t2 |
563 | beqz len, .Ldone\@ |
564 | ADD t1, dst, len # t1 is just past last byte of dst |
565 | li bits, 8*NBYTES |
566 | SLL rem, len, 3 # rem = number of bits to keep |
567 | LOAD(t0, 0(src)) |
568 | SUB bits, bits, rem # bits = number of bits to discard |
569 | SHIFT_DISCARD t0, t0, bits |
570 | STREST(t0, -1(t1)) |
571 | SHIFT_DISCARD_REVERT t0, t0, bits |
572 | .set reorder |
573 | ADDC(sum, t0) |
574 | b .Ldone\@ |
575 | .set noreorder |
576 | .Ldst_unaligned\@: |
577 | /* |
578 | * dst is unaligned |
579 | * t0 = src & ADDRMASK |
	 * t1 = dst & ADDRMASK; t1 > 0
581 | * len >= NBYTES |
582 | * |
583 | * Copy enough bytes to align dst |
584 | * Set match = (src and dst have same alignment) |
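	 *
	 * (e.g. with NBYTES == 4 and t1 == 1, t2 = NBYTES - t1 = 3 bytes
	 * are copied here to bring dst up to the next word boundary)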
585 | */ |
586 | #define match rem |
587 | LDFIRST(t3, FIRST(0)(src)) |
588 | ADD t2, zero, NBYTES |
589 | LDREST(t3, REST(0)(src)) |
590 | SUB t2, t2, t1 # t2 = number of bytes copied |
591 | xor match, t0, t1 |
592 | STFIRST(t3, FIRST(0)(dst)) |
593 | SLL t4, t1, 3 # t4 = number of bits to discard |
594 | SHIFT_DISCARD t3, t3, t4 |
595 | /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */ |
596 | ADDC(sum, t3) |
597 | beq len, t2, .Ldone\@ |
598 | SUB len, len, t2 |
599 | ADD dst, dst, t2 |
600 | beqz match, .Lboth_aligned\@ |
601 | ADD src, src, t2 |
602 | |
603 | .Lsrc_unaligned_dst_aligned\@: |
604 | SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter |
605 | beqz t0, .Lcleanup_src_unaligned\@ |
606 | and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES |
607 | 1: |
608 | /* |
609 | * Avoid consecutive LD*'s to the same register since some mips |
610 | * implementations can't issue them in the same cycle. |
611 | * It's OK to load FIRST(N+1) before REST(N) because the two addresses |
612 | * are to the same unit (unless src is aligned, but it's not). |
613 | */ |
614 | LDFIRST(t0, FIRST(0)(src)) |
615 | LDFIRST(t1, FIRST(1)(src)) |
616 | SUB len, len, 4*NBYTES |
617 | LDREST(t0, REST(0)(src)) |
618 | LDREST(t1, REST(1)(src)) |
619 | LDFIRST(t2, FIRST(2)(src)) |
620 | LDFIRST(t3, FIRST(3)(src)) |
621 | LDREST(t2, REST(2)(src)) |
622 | LDREST(t3, REST(3)(src)) |
623 | ADD src, src, 4*NBYTES |
624 | #ifdef CONFIG_CPU_SB1 |
625 | nop # improves slotting |
626 | #endif |
627 | STORE(t0, UNIT(0)(dst)) |
628 | ADDC(t0, t1) |
629 | STORE(t1, UNIT(1)(dst)) |
630 | ADDC(sum, t0) |
631 | STORE(t2, UNIT(2)(dst)) |
632 | ADDC(t2, t3) |
633 | STORE(t3, UNIT(3)(dst)) |
634 | ADDC(sum, t2) |
635 | .set reorder /* DADDI_WAR */ |
636 | ADD dst, dst, 4*NBYTES |
637 | bne len, rem, 1b |
638 | .set noreorder |
639 | |
640 | .Lcleanup_src_unaligned\@: |
641 | beqz len, .Ldone\@ |
642 | and rem, len, NBYTES-1 # rem = len % NBYTES |
643 | beq rem, len, .Lcopy_bytes\@ |
644 | nop |
645 | 1: |
646 | LDFIRST(t0, FIRST(0)(src)) |
647 | LDREST(t0, REST(0)(src)) |
648 | ADD src, src, NBYTES |
649 | SUB len, len, NBYTES |
650 | STORE(t0, 0(dst)) |
651 | ADDC(sum, t0) |
652 | .set reorder /* DADDI_WAR */ |
653 | ADD dst, dst, NBYTES |
654 | bne len, rem, 1b |
655 | .set noreorder |
656 | |
657 | .Lcopy_bytes_checklen\@: |
658 | beqz len, .Ldone\@ |
659 | nop |
660 | .Lcopy_bytes\@: |
661 | /* 0 < len < NBYTES */ |
662 | #ifdef CONFIG_CPU_LITTLE_ENDIAN |
663 | #define SHIFT_START 0 |
664 | #define SHIFT_INC 8 |
665 | #else |
666 | #define SHIFT_START 8*(NBYTES-1) |
667 | #define SHIFT_INC -8 |
668 | #endif |
669 | move t2, zero # partial word |
670 | li t3, SHIFT_START # shift |
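	/*
	 * COPY_BYTE(N) copies one byte and also shifts it into t2 at the
	 * position it would occupy within a naturally aligned word, so the
	 * whole tail is folded into the checksum with a single ADDC.
	 */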
671 | #define COPY_BYTE(N) \ |
672 | LOADBU(t0, N(src)); \ |
673 | SUB len, len, 1; \ |
674 | STOREB(t0, N(dst)); \ |
675 | SLLV t0, t0, t3; \ |
676 | addu t3, SHIFT_INC; \ |
677 | beqz len, .Lcopy_bytes_done\@; \ |
678 | or t2, t0 |
679 | |
680 | COPY_BYTE(0) |
681 | COPY_BYTE(1) |
682 | #ifdef USE_DOUBLE |
683 | COPY_BYTE(2) |
684 | COPY_BYTE(3) |
685 | COPY_BYTE(4) |
686 | COPY_BYTE(5) |
687 | #endif |
688 | LOADBU(t0, NBYTES-2(src)) |
689 | SUB len, len, 1 |
690 | STOREB(t0, NBYTES-2(dst)) |
691 | SLLV t0, t0, t3 |
692 | or t2, t0 |
693 | .Lcopy_bytes_done\@: |
694 | ADDC(sum, t2) |
695 | .Ldone\@: |
696 | /* fold checksum */ |
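	/*
	 * Same 64-bit fold and odd-address byte swap as at the end of
	 * csum_partial above, keyed off 'odd' instead of t7.
	 */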
697 | .set push |
698 | .set noat |
699 | #ifdef USE_DOUBLE |
700 | dsll32 v1, sum, 0 |
701 | daddu sum, v1 |
702 | sltu v1, sum, v1 |
703 | dsra32 sum, sum, 0 |
704 | addu sum, v1 |
705 | #endif |
706 | |
707 | #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \ |
708 | defined(CONFIG_CPU_LOONGSON64) |
709 | .set push |
710 | .set arch=mips32r2 |
711 | wsbh v1, sum |
712 | movn sum, v1, odd |
713 | .set pop |
714 | #else |
715 | beqz odd, 1f /* odd buffer alignment? */ |
716 | lui v1, 0x00ff |
717 | addu v1, 0x00ff |
718 | and t0, sum, v1 |
719 | sll t0, t0, 8 |
720 | srl sum, sum, 8 |
721 | and sum, sum, v1 |
722 | or sum, sum, t0 |
723 | 1: |
724 | #endif |
725 | .set pop |
726 | .set reorder |
727 | jr ra |
728 | .set noreorder |
729 | .endm |
730 | |
731 | .set noreorder |
732 | .L_exc: |
733 | jr ra |
734 | li v0, 0 |
735 | |
736 | FEXPORT(__csum_partial_copy_nocheck) |
737 | EXPORT_SYMBOL(__csum_partial_copy_nocheck) |
738 | #ifndef CONFIG_EVA |
739 | FEXPORT(__csum_partial_copy_to_user) |
740 | EXPORT_SYMBOL(__csum_partial_copy_to_user) |
741 | FEXPORT(__csum_partial_copy_from_user) |
742 | EXPORT_SYMBOL(__csum_partial_copy_from_user) |
743 | #endif |
744 | __BUILD_CSUM_PARTIAL_COPY_USER LEGACY_MODE USEROP USEROP |
745 | |
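/*
 * With EVA the user-facing variants must use the EVA load/store forms,
 * so each gets its own expansion of the macro below; without EVA all
 * three entry points above share the single LEGACY_MODE body.
 */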
746 | #ifdef CONFIG_EVA |
747 | LEAF(__csum_partial_copy_to_user) |
748 | __BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE KERNELOP USEROP |
749 | END(__csum_partial_copy_to_user) |
750 | |
751 | LEAF(__csum_partial_copy_from_user) |
752 | __BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE USEROP KERNELOP |
753 | END(__csum_partial_copy_from_user) |
754 | #endif |
755 | |