1 | /* |
2 | * This file is subject to the terms and conditions of the GNU General Public |
3 | * License. See the file "COPYING" in the main directory of this archive |
4 | * for more details. |
5 | * |
6 | * Quick'n'dirty IP checksum ... |
7 | * |
8 | * Copyright (C) 1998, 1999 Ralf Baechle |
9 | * Copyright (C) 1999 Silicon Graphics, Inc. |
10 | * Copyright (C) 2007 Maciej W. Rozycki |
11 | * Copyright (C) 2014 Imagination Technologies Ltd. |
12 | */ |
13 | #include <linux/errno.h> |
14 | #include <linux/export.h> |
15 | #include <asm/asm.h> |
16 | #include <asm/asm-offsets.h> |
17 | #include <asm/regdef.h> |
18 | |
19 | #ifdef CONFIG_64BIT |
20 | /* |
 * As we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the registers from the n64
 * ABI naming to the o32 ABI naming.
24 | */ |
25 | #undef t0 |
26 | #undef t1 |
27 | #undef t2 |
28 | #undef t3 |
29 | #define t0 $8 |
30 | #define t1 $9 |
31 | #define t2 $10 |
32 | #define t3 $11 |
33 | #define t4 $12 |
34 | #define t5 $13 |
35 | #define t6 $14 |
36 | #define t7 $15 |
37 | |
38 | #define USE_DOUBLE |
39 | #endif |
40 | |
41 | #ifdef USE_DOUBLE |
42 | |
43 | #define LOAD ld |
44 | #define LOAD32 lwu |
45 | #define ADD daddu |
46 | #define NBYTES 8 |
47 | |
48 | #else |
49 | |
50 | #define LOAD lw |
51 | #define LOAD32 lw |
52 | #define ADD addu |
53 | #define NBYTES 4 |
54 | |
55 | #endif /* USE_DOUBLE */ |
56 | |
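/*
 * With USE_DOUBLE the checksum is accumulated with 64-bit arithmetic,
 * eight bytes per load, and only folded down to 32 bits at the very end.
 */
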
57 | #define UNIT(unit) ((unit)*NBYTES) |
58 | |
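/*
 * ADDC adds 'reg' into the running checksum with an end-around carry:
 * any carry out of the addition (detected with sltu) is folded back
 * into the sum, as ones'-complement arithmetic requires.  ADDC32 is
 * the 32-bit-only variant, used to merge in the caller's partial sum.
 */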
59 | #define ADDC(sum,reg) \ |
60 | .set push; \ |
61 | .set noat; \ |
62 | ADD sum, reg; \ |
63 | sltu v1, sum, reg; \ |
64 | ADD sum, v1; \ |
65 | .set pop |
66 | |
67 | #define ADDC32(sum,reg) \ |
68 | .set push; \ |
69 | .set noat; \ |
70 | addu sum, reg; \ |
71 | sltu v1, sum, reg; \ |
72 | addu sum, v1; \ |
73 | .set pop |
74 | |
75 | #define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3) \ |
76 | LOAD _t0, (offset + UNIT(0))(src); \ |
77 | LOAD _t1, (offset + UNIT(1))(src); \ |
78 | LOAD _t2, (offset + UNIT(2))(src); \ |
79 | LOAD _t3, (offset + UNIT(3))(src); \ |
80 | ADDC(_t0, _t1); \ |
81 | ADDC(_t2, _t3); \ |
82 | ADDC(sum, _t0); \ |
83 | ADDC(sum, _t2) |
84 | |
85 | #ifdef USE_DOUBLE |
86 | #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \ |
87 | CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3) |
88 | #else |
89 | #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \ |
90 | CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3); \ |
91 | CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3) |
92 | #endif |
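
/*
 * Each CSUM_BIGCHUNK invocation folds 32 bytes into the running sum:
 * four doubleword loads on 64-bit kernels, eight word loads on 32-bit.
 */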
93 | |
94 | /* |
95 | * a0: source address |
96 | * a1: length of the area to checksum |
97 | * a2: partial checksum |
98 | */ |
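
/*
 * A minimal, hypothetical C-level view of a call (the result comes back
 * in v0; 'buf', 'len' and 'prev' are illustrative names only):
 *
 *	__wsum partial = csum_partial(buf, len, prev);
 *	__sum16 folded = csum_fold(partial);
 */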
99 | |
100 | #define src a0 |
101 | #define sum v0 |
102 | |
103 | .text |
104 | .set noreorder |
105 | .align 5 |
106 | LEAF(csum_partial) |
107 | EXPORT_SYMBOL(csum_partial) |
108 | move sum, zero |
109 | move t7, zero |
110 | |
111 | sltiu t8, a1, 0x8 |
112 | bnez t8, .Lsmall_csumcpy /* < 8 bytes to copy */ |
113 | move t2, a1 |
114 | |
115 | andi t7, src, 0x1 /* odd buffer? */ |
116 | |
117 | .Lhword_align: |
118 | beqz t7, .Lword_align |
119 | andi t8, src, 0x2 |
120 | |
121 | lbu t0, (src) |
122 | LONG_SUBU a1, a1, 0x1 |
123 | #ifdef __MIPSEL__ |
124 | sll t0, t0, 8 |
125 | #endif |
126 | ADDC(sum, t0) |
127 | PTR_ADDU src, src, 0x1 |
128 | andi t8, src, 0x2 |
129 | |
130 | .Lword_align: |
131 | beqz t8, .Ldword_align |
132 | sltiu t8, a1, 56 |
133 | |
134 | lhu t0, (src) |
135 | LONG_SUBU a1, a1, 0x2 |
136 | ADDC(sum, t0) |
137 | sltiu t8, a1, 56 |
138 | PTR_ADDU src, src, 0x2 |
139 | |
140 | .Ldword_align: |
141 | bnez t8, .Ldo_end_words |
142 | move t8, a1 |
143 | |
144 | andi t8, src, 0x4 |
145 | beqz t8, .Lqword_align |
146 | andi t8, src, 0x8 |
147 | |
148 | LOAD32 t0, 0x00(src) |
149 | LONG_SUBU a1, a1, 0x4 |
150 | ADDC(sum, t0) |
151 | PTR_ADDU src, src, 0x4 |
152 | andi t8, src, 0x8 |
153 | |
154 | .Lqword_align: |
155 | beqz t8, .Loword_align |
156 | andi t8, src, 0x10 |
157 | |
158 | #ifdef USE_DOUBLE |
159 | ld t0, 0x00(src) |
160 | LONG_SUBU a1, a1, 0x8 |
161 | ADDC(sum, t0) |
162 | #else |
163 | lw t0, 0x00(src) |
164 | lw t1, 0x04(src) |
165 | LONG_SUBU a1, a1, 0x8 |
166 | ADDC(sum, t0) |
167 | ADDC(sum, t1) |
168 | #endif |
169 | PTR_ADDU src, src, 0x8 |
170 | andi t8, src, 0x10 |
171 | |
172 | .Loword_align: |
173 | beqz t8, .Lbegin_movement |
174 | LONG_SRL t8, a1, 0x7 |
175 | |
176 | #ifdef USE_DOUBLE |
177 | ld t0, 0x00(src) |
178 | ld t1, 0x08(src) |
179 | ADDC(sum, t0) |
180 | ADDC(sum, t1) |
181 | #else |
182 | CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4) |
183 | #endif |
184 | LONG_SUBU a1, a1, 0x10 |
185 | PTR_ADDU src, src, 0x10 |
186 | LONG_SRL t8, a1, 0x7 |
187 | |
188 | .Lbegin_movement: |
189 | beqz t8, 1f |
190 | andi t2, a1, 0x40 |
191 | |
192 | .Lmove_128bytes: |
193 | CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) |
194 | CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4) |
195 | CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4) |
196 | CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4) |
197 | LONG_SUBU t8, t8, 0x01 |
198 | .set reorder /* DADDI_WAR */ |
199 | PTR_ADDU src, src, 0x80 |
200 | bnez t8, .Lmove_128bytes |
201 | .set noreorder |
202 | |
203 | 1: |
204 | beqz t2, 1f |
205 | andi t2, a1, 0x20 |
206 | |
207 | .Lmove_64bytes: |
208 | CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) |
209 | CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4) |
210 | PTR_ADDU src, src, 0x40 |
211 | |
212 | 1: |
213 | beqz t2, .Ldo_end_words |
214 | andi t8, a1, 0x1c |
215 | |
216 | .Lmove_32bytes: |
217 | CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) |
218 | andi t8, a1, 0x1c |
219 | PTR_ADDU src, src, 0x20 |
220 | |
221 | .Ldo_end_words: |
222 | beqz t8, .Lsmall_csumcpy |
223 | andi t2, a1, 0x3 |
224 | LONG_SRL t8, t8, 0x2 |
225 | |
226 | .Lend_words: |
227 | LOAD32 t0, (src) |
228 | LONG_SUBU t8, t8, 0x1 |
229 | ADDC(sum, t0) |
230 | .set reorder /* DADDI_WAR */ |
231 | PTR_ADDU src, src, 0x4 |
232 | bnez t8, .Lend_words |
233 | .set noreorder |
234 | |
235 | /* unknown src alignment and < 8 bytes to go */ |
236 | .Lsmall_csumcpy: |
237 | move a1, t2 |
238 | |
239 | andi t0, a1, 4 |
240 | beqz t0, 1f |
241 | andi t0, a1, 2 |
242 | |
243 | /* Still a full word to go */ |
244 | ulw t1, (src) |
245 | PTR_ADDIU src, 4 |
246 | #ifdef USE_DOUBLE |
247 | dsll t1, t1, 32 /* clear lower 32bit */ |
248 | #endif |
249 | ADDC(sum, t1) |
250 | |
251 | 1: move t1, zero |
252 | beqz t0, 1f |
253 | andi t0, a1, 1 |
254 | |
255 | /* Still a halfword to go */ |
256 | ulhu t1, (src) |
257 | PTR_ADDIU src, 2 |
258 | |
259 | 1: beqz t0, 1f |
260 | sll t1, t1, 16 |
261 | |
262 | lbu t2, (src) |
263 | nop |
264 | |
265 | #ifdef __MIPSEB__ |
266 | sll t2, t2, 8 |
267 | #endif |
268 | or t1, t2 |
269 | |
270 | 1: ADDC(sum, t1) |
271 | |
272 | /* fold checksum */ |
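	/*
	 * On 64-bit kernels the accumulator is 64 bits wide: add its upper
	 * and lower 32-bit halves together, fold the carry back in, and
	 * keep only the low 32 bits.
	 */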
273 | #ifdef USE_DOUBLE |
274 | dsll32 v1, sum, 0 |
275 | daddu sum, v1 |
276 | sltu v1, sum, v1 |
277 | dsra32 sum, sum, 0 |
278 | addu sum, v1 |
279 | #endif |
280 | |
281 | /* odd buffer alignment? */ |
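	/*
	 * If the buffer started on an odd address, every byte was summed
	 * into the opposite half of its 16-bit word, so the bytes of the
	 * result must be swapped within each halfword (wsbh where
	 * available, shift/mask otherwise).
	 */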
282 | #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \ |
283 | defined(CONFIG_CPU_LOONGSON64) |
284 | .set push |
285 | .set arch=mips32r2 |
286 | wsbh v1, sum |
287 | movn sum, v1, t7 |
288 | .set pop |
289 | #else |
290 | beqz t7, 1f /* odd buffer alignment? */ |
291 | lui v1, 0x00ff |
292 | addu v1, 0x00ff |
293 | and t0, sum, v1 |
294 | sll t0, t0, 8 |
295 | srl sum, sum, 8 |
296 | and sum, sum, v1 |
297 | or sum, sum, t0 |
298 | 1: |
299 | #endif |
300 | .set reorder |
301 | /* Add the passed partial csum. */ |
302 | ADDC32(sum, a2) |
303 | jr ra |
304 | .set noreorder |
305 | END(csum_partial) |
306 | |
307 | |
308 | /* |
309 | * checksum and copy routines based on memcpy.S |
310 | * |
 * __csum_partial_copy_nocheck(src, dst, len)
 * __csum_partial_copy_to_user(src, dst, len)
 * __csum_partial_copy_from_user(src, dst, len)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
316 | */ |
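
/*
 * A hypothetical C-level sketch of the common case: each entry point
 * takes (src, dst, len) in a0/a1/a2, copies len bytes from src to dst
 * and returns the partial ones'-complement checksum of the copied data
 * in v0, or 0 if a fault was taken:
 *
 *	__wsum csum = __csum_partial_copy_nocheck(src, dst, len);
 */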
317 | |
318 | #define src a0 |
319 | #define dst a1 |
320 | #define len a2 |
321 | #define sum v0 |
322 | #define odd t8 |
323 | |
324 | /* |
325 | * All exception handlers simply return 0. |
326 | */ |
327 | |
328 | /* Instruction type */ |
329 | #define LD_INSN 1 |
330 | #define ST_INSN 2 |
331 | #define LEGACY_MODE 1 |
332 | #define EVA_MODE 2 |
333 | #define USEROP 1 |
334 | #define KERNELOP 2 |
335 | |
336 | /* |
337 | * Wrapper to add an entry in the exception table |
338 | * in case the insn causes a memory exception. |
339 | * Arguments: |
340 | * insn : Load/store instruction |
341 | * type : Instruction type |
342 | * reg : Register |
343 | * addr : Address |
 *
 * Faulting accesses are all directed to the common .L_exc handler.
 */
346 | #define EXC(insn, type, reg, addr) \ |
347 | .if \mode == LEGACY_MODE; \ |
348 | 9: insn reg, addr; \ |
349 | .section __ex_table,"a"; \ |
350 | PTR_WD 9b, .L_exc; \ |
351 | .previous; \ |
352 | /* This is enabled in EVA mode */ \ |
353 | .else; \ |
354 | /* If loading from user or storing to user */ \ |
355 | .if ((\from == USEROP) && (type == LD_INSN)) || \ |
356 | ((\to == USEROP) && (type == ST_INSN)); \ |
357 | 9: __BUILD_EVA_INSN(insn##e, reg, addr); \ |
358 | .section __ex_table,"a"; \ |
359 | PTR_WD 9b, .L_exc; \ |
360 | .previous; \ |
361 | .else; \ |
362 | /* EVA without exception */ \ |
363 | insn reg, addr; \ |
364 | .endif; \ |
365 | .endif |
366 | |
367 | #undef LOAD |
368 | |
369 | #ifdef USE_DOUBLE |
370 | |
371 | #define LOADK ld /* No exception */ |
372 | #define LOAD(reg, addr) EXC(ld, LD_INSN, reg, addr) |
373 | #define LOADBU(reg, addr) EXC(lbu, LD_INSN, reg, addr) |
374 | #define LOADL(reg, addr) EXC(ldl, LD_INSN, reg, addr) |
375 | #define LOADR(reg, addr) EXC(ldr, LD_INSN, reg, addr) |
376 | #define STOREB(reg, addr) EXC(sb, ST_INSN, reg, addr) |
377 | #define STOREL(reg, addr) EXC(sdl, ST_INSN, reg, addr) |
378 | #define STORER(reg, addr) EXC(sdr, ST_INSN, reg, addr) |
379 | #define STORE(reg, addr) EXC(sd, ST_INSN, reg, addr) |
380 | #define ADD daddu |
381 | #define SUB dsubu |
382 | #define SRL dsrl |
383 | #define SLL dsll |
384 | #define SLLV dsllv |
385 | #define SRLV dsrlv |
386 | #define NBYTES 8 |
387 | #define LOG_NBYTES 3 |
388 | |
389 | #else |
390 | |
391 | #define LOADK lw /* No exception */ |
392 | #define LOAD(reg, addr) EXC(lw, LD_INSN, reg, addr) |
393 | #define LOADBU(reg, addr) EXC(lbu, LD_INSN, reg, addr) |
394 | #define LOADL(reg, addr) EXC(lwl, LD_INSN, reg, addr) |
395 | #define LOADR(reg, addr) EXC(lwr, LD_INSN, reg, addr) |
396 | #define STOREB(reg, addr) EXC(sb, ST_INSN, reg, addr) |
397 | #define STOREL(reg, addr) EXC(swl, ST_INSN, reg, addr) |
398 | #define STORER(reg, addr) EXC(swr, ST_INSN, reg, addr) |
399 | #define STORE(reg, addr) EXC(sw, ST_INSN, reg, addr) |
400 | #define ADD addu |
401 | #define SUB subu |
402 | #define SRL srl |
403 | #define SLL sll |
404 | #define SLLV sllv |
405 | #define SRLV srlv |
406 | #define NBYTES 4 |
407 | #define LOG_NBYTES 2 |
408 | |
409 | #endif /* USE_DOUBLE */ |
410 | |
411 | #ifdef CONFIG_CPU_LITTLE_ENDIAN |
412 | #define LDFIRST LOADR |
413 | #define LDREST LOADL |
414 | #define STFIRST STORER |
415 | #define STREST STOREL |
416 | #define SHIFT_DISCARD SLLV |
417 | #define SHIFT_DISCARD_REVERT SRLV |
418 | #else |
419 | #define LDFIRST LOADL |
420 | #define LDREST LOADR |
421 | #define STFIRST STOREL |
422 | #define STREST STORER |
423 | #define SHIFT_DISCARD SRLV |
424 | #define SHIFT_DISCARD_REVERT SLLV |
425 | #endif |
426 | |
427 | #define FIRST(unit) ((unit)*NBYTES) |
428 | #define REST(unit) (FIRST(unit)+NBYTES-1) |
429 | |
430 | #define ADDRMASK (NBYTES-1) |
431 | |
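/*
 * Without the DADDI workarounds $at is never needed here, so forbid it;
 * with the workarounds the assembler-expanded address arithmetic needs
 * a temporary, so the assembler temporary is redirected to v1.
 */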
432 | #ifndef CONFIG_CPU_DADDI_WORKAROUNDS |
433 | .set noat |
434 | #else |
435 | .set at=v1 |
436 | #endif |
437 | |
438 | .macro __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to |
439 | |
440 | li sum, -1 |
441 | move odd, zero |
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 */
446 | /* |
447 | * The "issue break"s below are very approximate. |
448 | * Issue delays for dcache fills will perturb the schedule, as will |
449 | * load queue full replay traps, etc. |
450 | * |
451 | * If len < NBYTES use byte operations. |
452 | */ |
453 | sltu t2, len, NBYTES |
454 | and t1, dst, ADDRMASK |
455 | bnez t2, .Lcopy_bytes_checklen\@ |
456 | and t0, src, ADDRMASK |
457 | andi odd, dst, 0x1 /* odd buffer? */ |
458 | bnez t1, .Ldst_unaligned\@ |
459 | nop |
460 | bnez t0, .Lsrc_unaligned_dst_aligned\@ |
461 | /* |
462 | * use delay slot for fall-through |
463 | * src and dst are aligned; need to compute rem |
464 | */ |
465 | .Lboth_aligned\@: |
466 | SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter |
467 | beqz t0, .Lcleanup_both_aligned\@ # len < 8*NBYTES |
468 | nop |
469 | SUB len, 8*NBYTES # subtract here for bgez loop |
470 | .align 4 |
471 | 1: |
472 | LOAD(t0, UNIT(0)(src)) |
473 | LOAD(t1, UNIT(1)(src)) |
474 | LOAD(t2, UNIT(2)(src)) |
475 | LOAD(t3, UNIT(3)(src)) |
476 | LOAD(t4, UNIT(4)(src)) |
477 | LOAD(t5, UNIT(5)(src)) |
478 | LOAD(t6, UNIT(6)(src)) |
479 | LOAD(t7, UNIT(7)(src)) |
480 | SUB len, len, 8*NBYTES |
481 | ADD src, src, 8*NBYTES |
482 | STORE(t0, UNIT(0)(dst)) |
483 | ADDC(t0, t1) |
484 | STORE(t1, UNIT(1)(dst)) |
485 | ADDC(sum, t0) |
486 | STORE(t2, UNIT(2)(dst)) |
487 | ADDC(t2, t3) |
488 | STORE(t3, UNIT(3)(dst)) |
489 | ADDC(sum, t2) |
490 | STORE(t4, UNIT(4)(dst)) |
491 | ADDC(t4, t5) |
492 | STORE(t5, UNIT(5)(dst)) |
493 | ADDC(sum, t4) |
494 | STORE(t6, UNIT(6)(dst)) |
495 | ADDC(t6, t7) |
496 | STORE(t7, UNIT(7)(dst)) |
497 | ADDC(sum, t6) |
498 | .set reorder /* DADDI_WAR */ |
499 | ADD dst, dst, 8*NBYTES |
500 | bgez len, 1b |
501 | .set noreorder |
502 | ADD len, 8*NBYTES # revert len (see above) |
503 | |
504 | /* |
505 | * len == the number of bytes left to copy < 8*NBYTES |
506 | */ |
507 | .Lcleanup_both_aligned\@: |
508 | #define rem t7 |
509 | beqz len, .Ldone\@ |
510 | sltu t0, len, 4*NBYTES |
511 | bnez t0, .Lless_than_4units\@ |
512 | and rem, len, (NBYTES-1) # rem = len % NBYTES |
513 | /* |
514 | * len >= 4*NBYTES |
515 | */ |
516 | LOAD(t0, UNIT(0)(src)) |
517 | LOAD(t1, UNIT(1)(src)) |
518 | LOAD(t2, UNIT(2)(src)) |
519 | LOAD(t3, UNIT(3)(src)) |
520 | SUB len, len, 4*NBYTES |
521 | ADD src, src, 4*NBYTES |
522 | STORE(t0, UNIT(0)(dst)) |
523 | ADDC(t0, t1) |
524 | STORE(t1, UNIT(1)(dst)) |
525 | ADDC(sum, t0) |
526 | STORE(t2, UNIT(2)(dst)) |
527 | ADDC(t2, t3) |
528 | STORE(t3, UNIT(3)(dst)) |
529 | ADDC(sum, t2) |
530 | .set reorder /* DADDI_WAR */ |
531 | ADD dst, dst, 4*NBYTES |
532 | beqz len, .Ldone\@ |
533 | .set noreorder |
534 | .Lless_than_4units\@: |
535 | /* |
536 | * rem = len % NBYTES |
537 | */ |
538 | beq rem, len, .Lcopy_bytes\@ |
539 | nop |
540 | 1: |
541 | LOAD(t0, 0(src)) |
542 | ADD src, src, NBYTES |
543 | SUB len, len, NBYTES |
544 | STORE(t0, 0(dst)) |
545 | ADDC(sum, t0) |
546 | .set reorder /* DADDI_WAR */ |
547 | ADD dst, dst, NBYTES |
548 | bne rem, len, 1b |
549 | .set noreorder |
550 | |
551 | /* |
552 | * src and dst are aligned, need to copy rem bytes (rem < NBYTES) |
553 | * A loop would do only a byte at a time with possible branch |
554 | * mispredicts. Can't do an explicit LOAD dst,mask,or,STORE |
555 | * because can't assume read-access to dst. Instead, use |
556 | * STREST dst, which doesn't require read access to dst. |
557 | * |
558 | * This code should perform better than a simple loop on modern, |
559 | * wide-issue mips processors because the code has fewer branches and |
560 | * more instruction-level parallelism. |
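	 *
	 * For example, with NBYTES == 4 and 3 bytes left: rem = 24 (bits to
	 * keep) and bits = 8 (bits to discard); the stray byte is shifted
	 * out before the partial store, and the value is shifted back
	 * before it is folded into the checksum.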
561 | */ |
562 | #define bits t2 |
563 | beqz len, .Ldone\@ |
564 | ADD t1, dst, len # t1 is just past last byte of dst |
565 | li bits, 8*NBYTES |
566 | SLL rem, len, 3 # rem = number of bits to keep |
567 | LOAD(t0, 0(src)) |
568 | SUB bits, bits, rem # bits = number of bits to discard |
569 | SHIFT_DISCARD t0, t0, bits |
570 | STREST(t0, -1(t1)) |
571 | SHIFT_DISCARD_REVERT t0, t0, bits |
572 | .set reorder |
573 | ADDC(sum, t0) |
574 | b .Ldone\@ |
575 | .set noreorder |
576 | .Ldst_unaligned\@: |
577 | /* |
578 | * dst is unaligned |
579 | * t0 = src & ADDRMASK |
	 * t1 = dst & ADDRMASK; t1 > 0
581 | * len >= NBYTES |
582 | * |
583 | * Copy enough bytes to align dst |
584 | * Set match = (src and dst have same alignment) |
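	 *
	 * (e.g. with NBYTES == 4 and t1 == 1, t2 = NBYTES - t1 = 3 bytes
	 * are copied here to bring dst up to the next word boundary)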
585 | */ |
586 | #define match rem |
587 | LDFIRST(t3, FIRST(0)(src)) |
588 | ADD t2, zero, NBYTES |
589 | LDREST(t3, REST(0)(src)) |
590 | SUB t2, t2, t1 # t2 = number of bytes copied |
591 | xor match, t0, t1 |
592 | STFIRST(t3, FIRST(0)(dst)) |
593 | SLL t4, t1, 3 # t4 = number of bits to discard |
594 | SHIFT_DISCARD t3, t3, t4 |
595 | /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */ |
596 | ADDC(sum, t3) |
597 | beq len, t2, .Ldone\@ |
598 | SUB len, len, t2 |
599 | ADD dst, dst, t2 |
600 | beqz match, .Lboth_aligned\@ |
601 | ADD src, src, t2 |
602 | |
603 | .Lsrc_unaligned_dst_aligned\@: |
604 | SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter |
605 | beqz t0, .Lcleanup_src_unaligned\@ |
606 | and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES |
607 | 1: |
608 | /* |
609 | * Avoid consecutive LD*'s to the same register since some mips |
610 | * implementations can't issue them in the same cycle. |
611 | * It's OK to load FIRST(N+1) before REST(N) because the two addresses |
612 | * are to the same unit (unless src is aligned, but it's not). |
613 | */ |
614 | LDFIRST(t0, FIRST(0)(src)) |
615 | LDFIRST(t1, FIRST(1)(src)) |
616 | SUB len, len, 4*NBYTES |
617 | LDREST(t0, REST(0)(src)) |
618 | LDREST(t1, REST(1)(src)) |
619 | LDFIRST(t2, FIRST(2)(src)) |
620 | LDFIRST(t3, FIRST(3)(src)) |
621 | LDREST(t2, REST(2)(src)) |
622 | LDREST(t3, REST(3)(src)) |
623 | ADD src, src, 4*NBYTES |
624 | #ifdef CONFIG_CPU_SB1 |
625 | nop # improves slotting |
626 | #endif |
627 | STORE(t0, UNIT(0)(dst)) |
628 | ADDC(t0, t1) |
629 | STORE(t1, UNIT(1)(dst)) |
630 | ADDC(sum, t0) |
631 | STORE(t2, UNIT(2)(dst)) |
632 | ADDC(t2, t3) |
633 | STORE(t3, UNIT(3)(dst)) |
634 | ADDC(sum, t2) |
635 | .set reorder /* DADDI_WAR */ |
636 | ADD dst, dst, 4*NBYTES |
637 | bne len, rem, 1b |
638 | .set noreorder |
639 | |
640 | .Lcleanup_src_unaligned\@: |
641 | beqz len, .Ldone\@ |
642 | and rem, len, NBYTES-1 # rem = len % NBYTES |
643 | beq rem, len, .Lcopy_bytes\@ |
644 | nop |
645 | 1: |
646 | LDFIRST(t0, FIRST(0)(src)) |
647 | LDREST(t0, REST(0)(src)) |
648 | ADD src, src, NBYTES |
649 | SUB len, len, NBYTES |
650 | STORE(t0, 0(dst)) |
651 | ADDC(sum, t0) |
652 | .set reorder /* DADDI_WAR */ |
653 | ADD dst, dst, NBYTES |
654 | bne len, rem, 1b |
655 | .set noreorder |
656 | |
657 | .Lcopy_bytes_checklen\@: |
658 | beqz len, .Ldone\@ |
659 | nop |
660 | .Lcopy_bytes\@: |
661 | /* 0 < len < NBYTES */ |
662 | #ifdef CONFIG_CPU_LITTLE_ENDIAN |
663 | #define SHIFT_START 0 |
664 | #define SHIFT_INC 8 |
665 | #else |
666 | #define SHIFT_START 8*(NBYTES-1) |
667 | #define SHIFT_INC -8 |
668 | #endif |
669 | move t2, zero # partial word |
670 | li t3, SHIFT_START # shift |
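	/*
	 * COPY_BYTE(N) copies one byte and also shifts it into t2 at the
	 * position it would occupy within a naturally aligned word, so the
	 * whole tail is folded into the checksum with a single ADDC.
	 */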
671 | #define COPY_BYTE(N) \ |
672 | LOADBU(t0, N(src)); \ |
673 | SUB len, len, 1; \ |
674 | STOREB(t0, N(dst)); \ |
675 | SLLV t0, t0, t3; \ |
676 | addu t3, SHIFT_INC; \ |
677 | beqz len, .Lcopy_bytes_done\@; \ |
678 | or t2, t0 |
679 | |
680 | COPY_BYTE(0) |
681 | COPY_BYTE(1) |
682 | #ifdef USE_DOUBLE |
683 | COPY_BYTE(2) |
684 | COPY_BYTE(3) |
685 | COPY_BYTE(4) |
686 | COPY_BYTE(5) |
687 | #endif |
688 | LOADBU(t0, NBYTES-2(src)) |
689 | SUB len, len, 1 |
690 | STOREB(t0, NBYTES-2(dst)) |
691 | SLLV t0, t0, t3 |
692 | or t2, t0 |
693 | .Lcopy_bytes_done\@: |
694 | ADDC(sum, t2) |
695 | .Ldone\@: |
696 | /* fold checksum */ |
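	/*
	 * Same 64-bit fold and odd-address byte swap as at the end of
	 * csum_partial above, keyed off 'odd' instead of t7.
	 */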
697 | .set push |
698 | .set noat |
699 | #ifdef USE_DOUBLE |
700 | dsll32 v1, sum, 0 |
701 | daddu sum, v1 |
702 | sltu v1, sum, v1 |
703 | dsra32 sum, sum, 0 |
704 | addu sum, v1 |
705 | #endif |
706 | |
707 | #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \ |
708 | defined(CONFIG_CPU_LOONGSON64) |
709 | .set push |
710 | .set arch=mips32r2 |
711 | wsbh v1, sum |
712 | movn sum, v1, odd |
713 | .set pop |
714 | #else |
715 | beqz odd, 1f /* odd buffer alignment? */ |
716 | lui v1, 0x00ff |
717 | addu v1, 0x00ff |
718 | and t0, sum, v1 |
719 | sll t0, t0, 8 |
720 | srl sum, sum, 8 |
721 | and sum, sum, v1 |
722 | or sum, sum, t0 |
723 | 1: |
724 | #endif |
725 | .set pop |
726 | .set reorder |
727 | jr ra |
728 | .set noreorder |
729 | .endm |
730 | |
731 | .set noreorder |
732 | .L_exc: |
733 | jr ra |
734 | li v0, 0 |
735 | |
736 | FEXPORT(__csum_partial_copy_nocheck) |
737 | EXPORT_SYMBOL(__csum_partial_copy_nocheck) |
738 | #ifndef CONFIG_EVA |
739 | FEXPORT(__csum_partial_copy_to_user) |
740 | EXPORT_SYMBOL(__csum_partial_copy_to_user) |
741 | FEXPORT(__csum_partial_copy_from_user) |
742 | EXPORT_SYMBOL(__csum_partial_copy_from_user) |
743 | #endif |
744 | __BUILD_CSUM_PARTIAL_COPY_USER LEGACY_MODE USEROP USEROP |
745 | |
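/*
 * With EVA the user-facing variants must use the EVA load/store forms,
 * so each gets its own expansion of the macro below; without EVA all
 * three entry points above share the single LEGACY_MODE body.
 */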
746 | #ifdef CONFIG_EVA |
747 | LEAF(__csum_partial_copy_to_user) |
748 | __BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE KERNELOP USEROP |
749 | END(__csum_partial_copy_to_user) |
750 | |
751 | LEAF(__csum_partial_copy_from_user) |
752 | __BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE USEROP KERNELOP |
753 | END(__csum_partial_copy_from_user) |
754 | #endif |
755 | |