1 | /* Copyright (C) 2012-2022 Free Software Foundation, Inc. |
2 | This file is part of the GNU C Library. |
3 | |
4 | The GNU C Library is free software; you can redistribute it and/or |
5 | modify it under the terms of the GNU Lesser General Public |
6 | License as published by the Free Software Foundation; either |
7 | version 2.1 of the License, or (at your option) any later version. |
8 | |
9 | The GNU C Library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with the GNU C Library. If not, see |
16 | <https://www.gnu.org/licenses/>. */ |
17 | |
18 | #ifdef ANDROID_CHANGES |
19 | # include "machine/asm.h" |
20 | # include "machine/regdef.h" |
21 | # define USE_MEMMOVE_FOR_OVERLAP |
22 | # define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED |
23 | # define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE |
24 | #elif _LIBC |
25 | # include <sysdep.h> |
26 | # include <regdef.h> |
27 | # include <sys/asm.h> |
28 | # define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED |
29 | # define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE |
30 | #elif defined _COMPILING_NEWLIB |
31 | # include "machine/asm.h" |
32 | # include "machine/regdef.h" |
33 | # define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED |
34 | # define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE |
35 | #else |
36 | # include <regdef.h> |
37 | # include <sys/asm.h> |
38 | #endif |
39 | |
40 | #if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \ |
41 | (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64) |
42 | # ifndef DISABLE_PREFETCH |
43 | # define USE_PREFETCH |
44 | # endif |
45 | #endif |
46 | |
47 | #if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32)) |
48 | # ifndef DISABLE_DOUBLE |
49 | # define USE_DOUBLE |
50 | # endif |
51 | #endif |
52 | |
53 | /* Some asm.h files do not have the L macro definition. */ |
54 | #ifndef L |
55 | # if _MIPS_SIM == _ABIO32 |
56 | # define L(label) $L ## label |
57 | # else |
58 | # define L(label) .L ## label |
59 | # endif |
60 | #endif |
61 | |
62 | /* Some asm.h files do not have the PTR_ADDIU macro definition. */ |
63 | #ifndef PTR_ADDIU |
64 | # ifdef USE_DOUBLE |
65 | # define PTR_ADDIU daddiu |
66 | # else |
67 | # define PTR_ADDIU addiu |
68 | # endif |
69 | #endif |
70 | |
71 | /* Some asm.h files do not have the PTR_SRA macro definition. */ |
72 | #ifndef PTR_SRA |
73 | # ifdef USE_DOUBLE |
74 | # define PTR_SRA dsra |
75 | # else |
76 | # define PTR_SRA sra |
77 | # endif |
78 | #endif |
79 | |
80 | /* New R6 instructions that may not be in asm.h. */ |
81 | #ifndef PTR_LSA |
82 | # if _MIPS_SIM == _ABI64 |
83 | # define PTR_LSA dlsa |
84 | # else |
85 | # define PTR_LSA lsa |
86 | # endif |
87 | #endif |
88 | |
89 | /* |
90 | * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load |
 * prefetches appears to offer a slight performance advantage.
92 | * |
93 | * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE |
94 | * or PREFETCH_STORE_STREAMED offers a large performance advantage |
95 | * but PREPAREFORSTORE has some special restrictions to consider. |
96 | * |
 * Prefetch with the 'prepare for store' hint does not copy a memory
 * location into the cache; it just allocates a cache line and zeros
 * it out.  This means that if you do not write to the entire cache
 * line before it is written back to memory, the bytes you did not write
 * are zeroed and that data is lost.
102 | * |
103 | * Also if you are using this memcpy to copy overlapping buffers it may |
104 | * not behave correctly when using the 'prepare for store' hint. If you |
105 | * use the 'prepare for store' prefetch on a memory area that is in the |
 * memcpy source (as well as the memcpy destination), then some data will
 * be zeroed out before you have a chance to read it, and that data will
 * be lost.
109 | * |
110 | * If you are going to use this memcpy routine with the 'prepare for store' |
111 | * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid |
112 | * the problem of running memcpy on overlapping buffers. |
113 | * |
114 | * There are ifdef'ed sections of this memcpy to make sure that it does not |
115 | * do prefetches on cache lines that are not going to be completely written. |
116 | * This code is only needed and only used when PREFETCH_STORE_HINT is set to |
117 | * PREFETCH_HINT_PREPAREFORSTORE. This code assumes that cache lines are |
118 | * 32 bytes and if the cache line is larger it will not work correctly. |
119 | */ |
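/*
 * For example (assuming 32-byte cache lines, as the rest of this code
 * does): issuing "pref PREFETCH_HINT_PREPAREFORSTORE, 0(a0)" and then
 * storing only 8 of the 32 bytes of that line leaves the other 24 bytes
 * zeroed when the line is eventually written back, silently corrupting
 * the destination.
 */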
120 | |
121 | #ifdef USE_PREFETCH |
122 | # define PREFETCH_HINT_LOAD 0 |
123 | # define PREFETCH_HINT_STORE 1 |
124 | # define PREFETCH_HINT_LOAD_STREAMED 4 |
125 | # define PREFETCH_HINT_STORE_STREAMED 5 |
126 | # define PREFETCH_HINT_LOAD_RETAINED 6 |
127 | # define PREFETCH_HINT_STORE_RETAINED 7 |
128 | # define PREFETCH_HINT_WRITEBACK_INVAL 25 |
129 | # define PREFETCH_HINT_PREPAREFORSTORE 30 |
130 | |
131 | /* |
132 | * If we have not picked out what hints to use at this point use the |
133 | * standard load and store prefetch hints. |
134 | */ |
135 | # ifndef PREFETCH_STORE_HINT |
136 | # define PREFETCH_STORE_HINT PREFETCH_HINT_STORE |
137 | # endif |
138 | # ifndef PREFETCH_LOAD_HINT |
139 | # define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD |
140 | # endif |
141 | |
142 | /* |
143 | * We double everything when USE_DOUBLE is true so we do 2 prefetches to |
144 | * get 64 bytes in that case. The assumption is that each individual |
145 | * prefetch brings in 32 bytes. |
146 | */ |
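/* For example, with USE_DOUBLE, PREFETCH_FOR_LOAD (2, a1) issues load
 * prefetches at 128(a1) and 160(a1); without USE_DOUBLE it issues a
 * single prefetch at 64(a1).  */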
147 | |
148 | # ifdef USE_DOUBLE |
149 | # define PREFETCH_CHUNK 64 |
150 | # define PREFETCH_FOR_LOAD(chunk, reg) \ |
151 | pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \ |
152 | pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg) |
153 | # define PREFETCH_FOR_STORE(chunk, reg) \ |
154 | pref PREFETCH_STORE_HINT, (chunk)*64(reg); \ |
155 | pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg) |
156 | # else |
157 | # define PREFETCH_CHUNK 32 |
158 | # define PREFETCH_FOR_LOAD(chunk, reg) \ |
159 | pref PREFETCH_LOAD_HINT, (chunk)*32(reg) |
160 | # define PREFETCH_FOR_STORE(chunk, reg) \ |
161 | pref PREFETCH_STORE_HINT, (chunk)*32(reg) |
162 | # endif |
/* MAX_PREFETCH_SIZE is the maximum size of a prefetch; it must not be less
164 | * than PREFETCH_CHUNK, the assumed size of each prefetch. If the real size |
165 | * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE |
166 | * hint is used, the code will not work correctly. If PREPAREFORSTORE is not |
167 | * used then MAX_PREFETCH_SIZE does not matter. */ |
168 | # define MAX_PREFETCH_SIZE 128 |
169 | /* PREFETCH_LIMIT is set based on the fact that we never use an offset greater |
170 | * than 5 on a STORE prefetch and that a single prefetch can never be larger |
171 | * than MAX_PREFETCH_SIZE. We add the extra 32 when USE_DOUBLE is set because |
172 | * we actually do two prefetches in that case, one 32 bytes after the other. */ |
173 | # ifdef USE_DOUBLE |
174 | # define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE |
175 | # else |
176 | # define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE |
177 | # endif |
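/* With the values above this works out to 5*64 + 32 + 128 = 480 bytes
 * when USE_DOUBLE is set, and 5*32 + 128 = 288 bytes otherwise.  */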
178 | # if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \ |
179 | && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE) |
/* We cannot handle this because the initial prefetches may fetch bytes that
 * are before the buffer being copied.  Store prefetches start at an offset
 * of 4 chunks so as to avoid this situation when using PREPAREFORSTORE.  */
183 | #error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small." |
184 | # endif |
185 | #else /* USE_PREFETCH not defined */ |
186 | # define PREFETCH_FOR_LOAD(offset, reg) |
187 | # define PREFETCH_FOR_STORE(offset, reg) |
188 | #endif |
189 | |
190 | #if __mips_isa_rev > 5 |
191 | # if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) |
192 | # undef PREFETCH_STORE_HINT |
193 | # define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED |
194 | # endif |
195 | # define R6_CODE |
196 | #endif |
197 | |
198 | /* Allow the routine to be named something else if desired. */ |
199 | #ifndef MEMCPY_NAME |
200 | # define MEMCPY_NAME memcpy |
201 | #endif |
202 | |
203 | /* We use these 32/64 bit registers as temporaries to do the copying. */ |
204 | #define REG0 t0 |
205 | #define REG1 t1 |
206 | #define REG2 t2 |
207 | #define REG3 t3 |
208 | #if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABIO32) || (_MIPS_SIM == _ABIO64)) |
209 | # define REG4 t4 |
210 | # define REG5 t5 |
211 | # define REG6 t6 |
212 | # define REG7 t7 |
213 | #else |
214 | # define REG4 ta0 |
215 | # define REG5 ta1 |
216 | # define REG6 ta2 |
217 | # define REG7 ta3 |
218 | #endif |
219 | |
220 | /* We load/store 64 bits at a time when USE_DOUBLE is true. |
221 | * The C_ prefix stands for CHUNK and is used to avoid macro name |
222 | * conflicts with system header files. */ |
223 | |
224 | #ifdef USE_DOUBLE |
225 | # define C_ST sd |
226 | # define C_LD ld |
227 | # ifdef __MIPSEB |
228 | # define C_LDHI ldl /* high part is left in big-endian */ |
229 | # define C_STHI sdl /* high part is left in big-endian */ |
230 | # define C_LDLO ldr /* low part is right in big-endian */ |
231 | # define C_STLO sdr /* low part is right in big-endian */ |
232 | # else |
233 | # define C_LDHI ldr /* high part is right in little-endian */ |
234 | # define C_STHI sdr /* high part is right in little-endian */ |
235 | # define C_LDLO ldl /* low part is left in little-endian */ |
236 | # define C_STLO sdl /* low part is left in little-endian */ |
237 | # endif |
238 | # define C_ALIGN dalign /* r6 align instruction */ |
239 | #else |
240 | # define C_ST sw |
241 | # define C_LD lw |
242 | # ifdef __MIPSEB |
243 | # define C_LDHI lwl /* high part is left in big-endian */ |
244 | # define C_STHI swl /* high part is left in big-endian */ |
245 | # define C_LDLO lwr /* low part is right in big-endian */ |
246 | # define C_STLO swr /* low part is right in big-endian */ |
247 | # else |
248 | # define C_LDHI lwr /* high part is right in little-endian */ |
249 | # define C_STHI swr /* high part is right in little-endian */ |
250 | # define C_LDLO lwl /* low part is left in little-endian */ |
251 | # define C_STLO swl /* low part is left in little-endian */ |
252 | # endif |
253 | # define C_ALIGN align /* r6 align instruction */ |
254 | #endif |
255 | |
256 | /* Bookkeeping values for 32 vs. 64 bit mode. */ |
257 | #ifdef USE_DOUBLE |
258 | # define NSIZE 8 |
259 | # define NSIZEMASK 0x3f |
260 | # define NSIZEDMASK 0x7f |
261 | #else |
262 | # define NSIZE 4 |
263 | # define NSIZEMASK 0x1f |
264 | # define NSIZEDMASK 0x3f |
265 | #endif |
266 | #define UNIT(unit) ((unit)*NSIZE) |
267 | #define UNITM1(unit) (((unit)*NSIZE)-1) |
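/* For example, UNIT(3) is 24 and UNITM1(3) is 23 when USE_DOUBLE is set,
   and 12 and 11 respectively otherwise.  */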
268 | |
269 | #ifdef ANDROID_CHANGES |
270 | LEAF(MEMCPY_NAME, 0) |
271 | #else |
272 | LEAF(MEMCPY_NAME) |
273 | #endif |
274 | .set nomips16 |
275 | .set noreorder |
276 | /* |
277 | * Below we handle the case where memcpy is called with overlapping src and dst. |
 * Although memcpy is not required to handle this case, some parts of Android
 * (such as Skia) rely on such usage.  We call memmove to handle such cases.
280 | */ |
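/* In C terms, the USE_MEMMOVE_FOR_OVERLAP sequence below is roughly
 * (a sketch of the logic, not an instruction-by-instruction equivalent):
 *
 *   ptrdiff_t d = (char *) dst - (char *) src;
 *   if ((size_t) (d < 0 ? -d : d) < n)
 *     return memmove (dst, src, n);
 */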
281 | #ifdef USE_MEMMOVE_FOR_OVERLAP |
282 | PTR_SUBU t0,a0,a1 |
283 | PTR_SRA t2,t0,31 |
284 | xor t1,t0,t2 |
285 | PTR_SUBU t0,t1,t2 |
286 | sltu t2,t0,a2 |
287 | beq t2,zero,L(memcpy) |
288 | la t9,memmove |
289 | jr t9 |
290 | nop |
291 | L(memcpy): |
292 | #endif |
293 | /* |
 * If the size is less than 2*NSIZE (8 or 16), go to L(lasts).  Regardless of
295 | * size, copy dst pointer to v0 for the return value. |
296 | */ |
297 | slti t2,a2,(2 * NSIZE) |
298 | bne t2,zero,L(lasts) |
299 | #if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH) |
300 | move v0,zero |
301 | #else |
302 | move v0,a0 |
303 | #endif |
304 | |
305 | #ifndef R6_CODE |
306 | |
307 | /* |
308 | * If src and dst have different alignments, go to L(unaligned), if they |
309 | * have the same alignment (but are not actually aligned) do a partial |
310 | * load/store to make them aligned. If they are both already aligned |
311 | * we can start copying at L(aligned). |
312 | */ |
313 | xor t8,a1,a0 |
314 | andi t8,t8,(NSIZE-1) /* t8 is a0/a1 word-displacement */ |
315 | bne t8,zero,L(unaligned) |
316 | PTR_SUBU a3, zero, a0 |
317 | |
318 | andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */ |
319 | beq a3,zero,L(aligned) /* if a3=0, it is already aligned */ |
	PTR_SUBU a2,a2,a3	/* a2 is the remaining byte count */
321 | |
322 | C_LDHI t8,0(a1) |
323 | PTR_ADDU a1,a1,a3 |
324 | C_STHI t8,0(a0) |
325 | PTR_ADDU a0,a0,a3 |
326 | |
327 | #else /* R6_CODE */ |
328 | |
329 | /* |
330 | * Align the destination and hope that the source gets aligned too. If it |
331 | * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6 |
332 | * align instruction. |
333 | */ |
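/* The jump table below is indexed by (a0 & 7); each bc entry is 4 bytes,
 * hence the shift-by-2 in PTR_LSA.  Entry 0 branches straight to L(lb0),
 * and entry k (k != 0) branches to the code that copies the 8-k bytes
 * needed to bring the destination up to an 8-byte boundary.  */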
334 | andi t8,a0,7 |
335 | lapc t9,L(atable) |
336 | PTR_LSA t9,t8,t9,2 |
337 | jrc t9 |
338 | L(atable): |
339 | bc L(lb0) |
340 | bc L(lb7) |
341 | bc L(lb6) |
342 | bc L(lb5) |
343 | bc L(lb4) |
344 | bc L(lb3) |
345 | bc L(lb2) |
346 | bc L(lb1) |
347 | L(lb7): |
348 | lb a3, 6(a1) |
349 | sb a3, 6(a0) |
350 | L(lb6): |
351 | lb a3, 5(a1) |
352 | sb a3, 5(a0) |
353 | L(lb5): |
354 | lb a3, 4(a1) |
355 | sb a3, 4(a0) |
356 | L(lb4): |
357 | lb a3, 3(a1) |
358 | sb a3, 3(a0) |
359 | L(lb3): |
360 | lb a3, 2(a1) |
361 | sb a3, 2(a0) |
362 | L(lb2): |
363 | lb a3, 1(a1) |
364 | sb a3, 1(a0) |
365 | L(lb1): |
366 | lb a3, 0(a1) |
367 | sb a3, 0(a0) |
368 | |
369 | li t9,8 |
370 | subu t8,t9,t8 |
371 | PTR_SUBU a2,a2,t8 |
372 | PTR_ADDU a0,a0,t8 |
373 | PTR_ADDU a1,a1,t8 |
374 | L(lb0): |
375 | |
376 | andi t8,a1,(NSIZE-1) |
377 | lapc t9,L(jtable) |
378 | PTR_LSA t9,t8,t9,2 |
379 | jrc t9 |
380 | L(jtable): |
381 | bc L(aligned) |
382 | bc L(r6_unaligned1) |
383 | bc L(r6_unaligned2) |
384 | bc L(r6_unaligned3) |
385 | # ifdef USE_DOUBLE |
386 | bc L(r6_unaligned4) |
387 | bc L(r6_unaligned5) |
388 | bc L(r6_unaligned6) |
389 | bc L(r6_unaligned7) |
390 | # endif |
391 | #endif /* R6_CODE */ |
392 | |
393 | L(aligned): |
394 | |
395 | /* |
 * Now dst/src are both aligned on (word or double word) boundaries.
397 | * Set a2 to count how many bytes we have to copy after all the 64/128 byte |
398 | * chunks are copied and a3 to the dst pointer after all the 64/128 byte |
399 | * chunks have been copied. We will loop, incrementing a0 and a1 until a0 |
400 | * equals a3. |
401 | */ |
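/* For example, with USE_DOUBLE and a2 = 300: t8 = 300 & 0x7f = 44, so the
 * loop below copies 256 bytes and 44 bytes remain for the later code.  */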
402 | |
403 | andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */ |
404 | beq a2,t8,L(chkw) /* if a2==t8, no 64-byte/128-byte chunks */ |
	PTR_SUBU a3,a2,t8	/* subtract the remainder from a2 */
406 | PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */ |
407 | |
/* While in the loop we may prefetch with the 'prepare for store' hint;
 * in that case a0+x must not go past the "t0-32" address.  This means
 * that for x=128 the last "safe" a0 address is "t0-160", and for x=64
 * the last "safe" a0 address is "t0-96".  The current version uses
 * "prefetch hint,128(a0)", so "t0-160" is the limit.
 */
414 | #if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) |
415 | PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */ |
416 | PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */ |
417 | #endif |
418 | PREFETCH_FOR_LOAD (0, a1) |
419 | PREFETCH_FOR_LOAD (1, a1) |
420 | PREFETCH_FOR_LOAD (2, a1) |
421 | PREFETCH_FOR_LOAD (3, a1) |
422 | #if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) |
423 | PREFETCH_FOR_STORE (1, a0) |
424 | PREFETCH_FOR_STORE (2, a0) |
425 | PREFETCH_FOR_STORE (3, a0) |
426 | #endif |
427 | #if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH) |
428 | # if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE |
429 | sltu v1,t9,a0 |
430 | bgtz v1,L(skip_set) |
431 | nop |
432 | PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4) |
433 | L(skip_set): |
434 | # else |
435 | PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1) |
436 | # endif |
437 | #endif |
438 | #if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \ |
439 | && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) |
440 | PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3) |
441 | # ifdef USE_DOUBLE |
442 | PTR_ADDIU v0,v0,32 |
443 | # endif |
444 | #endif |
445 | L(loop16w): |
446 | C_LD t0,UNIT(0)(a1) |
447 | #if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) |
448 | sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */ |
449 | bgtz v1,L(skip_pref) |
450 | #endif |
451 | C_LD t1,UNIT(1)(a1) |
452 | #ifdef R6_CODE |
453 | PREFETCH_FOR_STORE (2, a0) |
454 | #else |
455 | PREFETCH_FOR_STORE (4, a0) |
456 | PREFETCH_FOR_STORE (5, a0) |
457 | #endif |
458 | #if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) |
459 | PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5) |
460 | # ifdef USE_DOUBLE |
461 | PTR_ADDIU v0,v0,32 |
462 | # endif |
463 | #endif |
464 | L(skip_pref): |
465 | C_LD REG2,UNIT(2)(a1) |
466 | C_LD REG3,UNIT(3)(a1) |
467 | C_LD REG4,UNIT(4)(a1) |
468 | C_LD REG5,UNIT(5)(a1) |
469 | C_LD REG6,UNIT(6)(a1) |
470 | C_LD REG7,UNIT(7)(a1) |
471 | #ifdef R6_CODE |
472 | PREFETCH_FOR_LOAD (3, a1) |
473 | #else |
474 | PREFETCH_FOR_LOAD (4, a1) |
475 | #endif |
476 | C_ST t0,UNIT(0)(a0) |
477 | C_ST t1,UNIT(1)(a0) |
478 | C_ST REG2,UNIT(2)(a0) |
479 | C_ST REG3,UNIT(3)(a0) |
480 | C_ST REG4,UNIT(4)(a0) |
481 | C_ST REG5,UNIT(5)(a0) |
482 | C_ST REG6,UNIT(6)(a0) |
483 | C_ST REG7,UNIT(7)(a0) |
484 | |
485 | C_LD t0,UNIT(8)(a1) |
486 | C_LD t1,UNIT(9)(a1) |
487 | C_LD REG2,UNIT(10)(a1) |
488 | C_LD REG3,UNIT(11)(a1) |
489 | C_LD REG4,UNIT(12)(a1) |
490 | C_LD REG5,UNIT(13)(a1) |
491 | C_LD REG6,UNIT(14)(a1) |
492 | C_LD REG7,UNIT(15)(a1) |
493 | #ifndef R6_CODE |
494 | PREFETCH_FOR_LOAD (5, a1) |
495 | #endif |
496 | C_ST t0,UNIT(8)(a0) |
497 | C_ST t1,UNIT(9)(a0) |
498 | C_ST REG2,UNIT(10)(a0) |
499 | C_ST REG3,UNIT(11)(a0) |
500 | C_ST REG4,UNIT(12)(a0) |
501 | C_ST REG5,UNIT(13)(a0) |
502 | C_ST REG6,UNIT(14)(a0) |
503 | C_ST REG7,UNIT(15)(a0) |
504 | PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */ |
505 | bne a0,a3,L(loop16w) |
506 | PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */ |
507 | move a2,t8 |
508 | |
/* Here we have src and dest word-aligned but fewer than 64 (or 128) bytes
 * to go.  Check for a 32 (or 64) byte chunk and copy it if there is one.
 * Otherwise jump down to L(chk1w) to handle the tail end of the copy.
 */
514 | |
515 | L(chkw): |
516 | PREFETCH_FOR_LOAD (0, a1) |
	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk? */
				/* t8 is the remainder count past the 32-byte/64-byte chunks */
	beq	a2,t8,L(chk1w)	/* When a2==t8, there is no 32-byte/64-byte chunk */
520 | nop |
521 | C_LD t0,UNIT(0)(a1) |
522 | C_LD t1,UNIT(1)(a1) |
523 | C_LD REG2,UNIT(2)(a1) |
524 | C_LD REG3,UNIT(3)(a1) |
525 | C_LD REG4,UNIT(4)(a1) |
526 | C_LD REG5,UNIT(5)(a1) |
527 | C_LD REG6,UNIT(6)(a1) |
528 | C_LD REG7,UNIT(7)(a1) |
529 | PTR_ADDIU a1,a1,UNIT(8) |
530 | C_ST t0,UNIT(0)(a0) |
531 | C_ST t1,UNIT(1)(a0) |
532 | C_ST REG2,UNIT(2)(a0) |
533 | C_ST REG3,UNIT(3)(a0) |
534 | C_ST REG4,UNIT(4)(a0) |
535 | C_ST REG5,UNIT(5)(a0) |
536 | C_ST REG6,UNIT(6)(a0) |
537 | C_ST REG7,UNIT(7)(a0) |
538 | PTR_ADDIU a0,a0,UNIT(8) |
539 | |
540 | /* |
541 | * Here we have less than 32(64) bytes to copy. Set up for a loop to |
542 | * copy one word (or double word) at a time. Set a2 to count how many |
543 | * bytes we have to copy after all the word (or double word) chunks are |
544 | * copied and a3 to the dst pointer after all the (d)word chunks have |
545 | * been copied. We will loop, incrementing a0 and a1 until a0 equals a3. |
546 | */ |
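/* For example, with USE_DOUBLE and t8 = 27 bytes remaining: a2 becomes
 * 27 & 7 = 3, the loop below copies 24 bytes in 8-byte units, and the
 * final 3 bytes are left for L(lastw)/L(lastb).  */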
547 | L(chk1w): |
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past the (d)word chunks */
549 | beq a2,t8,L(lastw) |
	PTR_SUBU a3,t8,a2	/* a3 is the count of bytes in whole (d)word chunks */
551 | PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */ |
552 | |
553 | /* copying in words (4-byte or 8-byte chunks) */ |
554 | L(wordCopy_loop): |
555 | C_LD REG3,UNIT(0)(a1) |
556 | PTR_ADDIU a0,a0,UNIT(1) |
557 | PTR_ADDIU a1,a1,UNIT(1) |
558 | bne a0,a3,L(wordCopy_loop) |
559 | C_ST REG3,UNIT(-1)(a0) |
560 | |
561 | /* If we have been copying double words, see if we can copy a single word |
562 | before doing byte copies. We can have, at most, one word to copy. */ |
563 | |
564 | L(lastw): |
565 | #ifdef USE_DOUBLE |
	andi	t8,a2,3		/* t8 is the remainder past 4-byte chunks.  */
567 | beq t8,a2,L(lastb) |
568 | move a2,t8 |
569 | lw REG3,0(a1) |
570 | sw REG3,0(a0) |
571 | PTR_ADDIU a0,a0,4 |
572 | PTR_ADDIU a1,a1,4 |
573 | #endif |
574 | |
575 | /* Copy the last 8 (or 16) bytes */ |
576 | L(lastb): |
577 | blez a2,L(leave) |
578 | PTR_ADDU a3,a0,a2 /* a3 is the last dst address */ |
579 | L(lastbloop): |
580 | lb v1,0(a1) |
581 | PTR_ADDIU a0,a0,1 |
582 | PTR_ADDIU a1,a1,1 |
583 | bne a0,a3,L(lastbloop) |
584 | sb v1,-1(a0) |
585 | L(leave): |
586 | j ra |
587 | nop |
588 | |
589 | /* We jump here with a memcpy of less than 8 or 16 bytes, depending on |
590 | whether or not USE_DOUBLE is defined. Instead of just doing byte |
591 | copies, check the alignment and size and use lw/sw if possible. |
592 | Otherwise, do byte copies. */ |
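/* Roughly, in C terms (a sketch of the logic below, not exact code):
 *
 *   if ((n & 3) == n || (dst & 3) != 0 || (src & 3) != 0)
 *     goto lastb;                          byte-by-byte copy
 *   copy (n & ~3) bytes with lw/sw;        word loop
 *   n &= 3; goto lastb;                    finish byte-by-byte
 */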
593 | |
594 | L(lasts): |
595 | andi t8,a2,3 |
596 | beq t8,a2,L(lastb) |
597 | |
598 | andi t9,a0,3 |
599 | bne t9,zero,L(lastb) |
600 | andi t9,a1,3 |
601 | bne t9,zero,L(lastb) |
602 | |
603 | PTR_SUBU a3,a2,t8 |
604 | PTR_ADDU a3,a0,a3 |
605 | |
606 | L(wcopy_loop): |
607 | lw REG3,0(a1) |
608 | PTR_ADDIU a0,a0,4 |
609 | PTR_ADDIU a1,a1,4 |
610 | bne a0,a3,L(wcopy_loop) |
611 | sw REG3,-4(a0) |
612 | |
613 | b L(lastb) |
614 | move a2,t8 |
615 | |
616 | #ifndef R6_CODE |
617 | /* |
618 | * UNALIGNED case, got here with a3 = "negu a0" |
619 | * This code is nearly identical to the aligned code above |
620 | * but only the destination (not the source) gets aligned |
621 | * so we need to do partial loads of the source followed |
622 | * by normal stores to the destination (once we have aligned |
623 | * the destination). |
624 | */ |
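/* Each C_LDHI/C_LDLO (lwl/lwr or ldl/ldr) pair below assembles one
 * unaligned (d)word from the source into a single register: one
 * instruction picks up the bytes that lie before the next alignment
 * boundary and the other picks up the bytes after it.  */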
625 | |
626 | L(unaligned): |
627 | andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */ |
628 | beqz a3,L(ua_chk16w) /* if a3=0, it is already aligned */ |
	PTR_SUBU a2,a2,a3	/* a2 is the remaining byte count */
630 | |
631 | C_LDHI v1,UNIT(0)(a1) |
632 | C_LDLO v1,UNITM1(1)(a1) |
633 | PTR_ADDU a1,a1,a3 |
634 | C_STHI v1,UNIT(0)(a0) |
635 | PTR_ADDU a0,a0,a3 |
636 | |
637 | /* |
638 | * Now the destination (but not the source) is aligned |
639 | * Set a2 to count how many bytes we have to copy after all the 64/128 byte |
640 | * chunks are copied and a3 to the dst pointer after all the 64/128 byte |
641 | * chunks have been copied. We will loop, incrementing a0 and a1 until a0 |
642 | * equals a3. |
643 | */ |
644 | |
645 | L(ua_chk16w): |
646 | andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */ |
647 | beq a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */ |
	PTR_SUBU a3,a2,t8	/* subtract the remainder from a2 */
649 | PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */ |
650 | |
651 | # if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) |
652 | PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */ |
653 | PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */ |
654 | # endif |
655 | PREFETCH_FOR_LOAD (0, a1) |
656 | PREFETCH_FOR_LOAD (1, a1) |
657 | PREFETCH_FOR_LOAD (2, a1) |
658 | # if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) |
659 | PREFETCH_FOR_STORE (1, a0) |
660 | PREFETCH_FOR_STORE (2, a0) |
661 | PREFETCH_FOR_STORE (3, a0) |
662 | # endif |
663 | # if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH) |
664 | # if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) |
665 | sltu v1,t9,a0 |
666 | bgtz v1,L(ua_skip_set) |
667 | nop |
668 | PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4) |
669 | L(ua_skip_set): |
670 | # else |
671 | PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1) |
672 | # endif |
673 | # endif |
674 | L(ua_loop16w): |
675 | PREFETCH_FOR_LOAD (3, a1) |
676 | C_LDHI t0,UNIT(0)(a1) |
677 | C_LDHI t1,UNIT(1)(a1) |
678 | C_LDHI REG2,UNIT(2)(a1) |
679 | # if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) |
680 | sltu v1,t9,a0 |
681 | bgtz v1,L(ua_skip_pref) |
682 | # endif |
683 | C_LDHI REG3,UNIT(3)(a1) |
684 | PREFETCH_FOR_STORE (4, a0) |
685 | PREFETCH_FOR_STORE (5, a0) |
686 | L(ua_skip_pref): |
687 | C_LDHI REG4,UNIT(4)(a1) |
688 | C_LDHI REG5,UNIT(5)(a1) |
689 | C_LDHI REG6,UNIT(6)(a1) |
690 | C_LDHI REG7,UNIT(7)(a1) |
691 | C_LDLO t0,UNITM1(1)(a1) |
692 | C_LDLO t1,UNITM1(2)(a1) |
693 | C_LDLO REG2,UNITM1(3)(a1) |
694 | C_LDLO REG3,UNITM1(4)(a1) |
695 | C_LDLO REG4,UNITM1(5)(a1) |
696 | C_LDLO REG5,UNITM1(6)(a1) |
697 | C_LDLO REG6,UNITM1(7)(a1) |
698 | C_LDLO REG7,UNITM1(8)(a1) |
699 | PREFETCH_FOR_LOAD (4, a1) |
700 | C_ST t0,UNIT(0)(a0) |
701 | C_ST t1,UNIT(1)(a0) |
702 | C_ST REG2,UNIT(2)(a0) |
703 | C_ST REG3,UNIT(3)(a0) |
704 | C_ST REG4,UNIT(4)(a0) |
705 | C_ST REG5,UNIT(5)(a0) |
706 | C_ST REG6,UNIT(6)(a0) |
707 | C_ST REG7,UNIT(7)(a0) |
708 | C_LDHI t0,UNIT(8)(a1) |
709 | C_LDHI t1,UNIT(9)(a1) |
710 | C_LDHI REG2,UNIT(10)(a1) |
711 | C_LDHI REG3,UNIT(11)(a1) |
712 | C_LDHI REG4,UNIT(12)(a1) |
713 | C_LDHI REG5,UNIT(13)(a1) |
714 | C_LDHI REG6,UNIT(14)(a1) |
715 | C_LDHI REG7,UNIT(15)(a1) |
716 | C_LDLO t0,UNITM1(9)(a1) |
717 | C_LDLO t1,UNITM1(10)(a1) |
718 | C_LDLO REG2,UNITM1(11)(a1) |
719 | C_LDLO REG3,UNITM1(12)(a1) |
720 | C_LDLO REG4,UNITM1(13)(a1) |
721 | C_LDLO REG5,UNITM1(14)(a1) |
722 | C_LDLO REG6,UNITM1(15)(a1) |
723 | C_LDLO REG7,UNITM1(16)(a1) |
724 | PREFETCH_FOR_LOAD (5, a1) |
725 | C_ST t0,UNIT(8)(a0) |
726 | C_ST t1,UNIT(9)(a0) |
727 | C_ST REG2,UNIT(10)(a0) |
728 | C_ST REG3,UNIT(11)(a0) |
729 | C_ST REG4,UNIT(12)(a0) |
730 | C_ST REG5,UNIT(13)(a0) |
731 | C_ST REG6,UNIT(14)(a0) |
732 | C_ST REG7,UNIT(15)(a0) |
733 | PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */ |
734 | bne a0,a3,L(ua_loop16w) |
735 | PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */ |
736 | move a2,t8 |
737 | |
/* Here we have the destination word-aligned but fewer than 64 (or 128)
 * bytes to go.  Check for a 32 (or 64) byte chunk and copy it if there is
 * one.  Otherwise jump down to L(ua_chk1w) to handle the tail end of the
 * copy.  */
742 | |
743 | L(ua_chkw): |
744 | PREFETCH_FOR_LOAD (0, a1) |
	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk? */
				/* t8 is the remainder count past the 32-byte/64-byte chunks */
	beq	a2,t8,L(ua_chk1w) /* When a2==t8, there is no 32-byte/64-byte chunk */
748 | nop |
749 | C_LDHI t0,UNIT(0)(a1) |
750 | C_LDHI t1,UNIT(1)(a1) |
751 | C_LDHI REG2,UNIT(2)(a1) |
752 | C_LDHI REG3,UNIT(3)(a1) |
753 | C_LDHI REG4,UNIT(4)(a1) |
754 | C_LDHI REG5,UNIT(5)(a1) |
755 | C_LDHI REG6,UNIT(6)(a1) |
756 | C_LDHI REG7,UNIT(7)(a1) |
757 | C_LDLO t0,UNITM1(1)(a1) |
758 | C_LDLO t1,UNITM1(2)(a1) |
759 | C_LDLO REG2,UNITM1(3)(a1) |
760 | C_LDLO REG3,UNITM1(4)(a1) |
761 | C_LDLO REG4,UNITM1(5)(a1) |
762 | C_LDLO REG5,UNITM1(6)(a1) |
763 | C_LDLO REG6,UNITM1(7)(a1) |
764 | C_LDLO REG7,UNITM1(8)(a1) |
765 | PTR_ADDIU a1,a1,UNIT(8) |
766 | C_ST t0,UNIT(0)(a0) |
767 | C_ST t1,UNIT(1)(a0) |
768 | C_ST REG2,UNIT(2)(a0) |
769 | C_ST REG3,UNIT(3)(a0) |
770 | C_ST REG4,UNIT(4)(a0) |
771 | C_ST REG5,UNIT(5)(a0) |
772 | C_ST REG6,UNIT(6)(a0) |
773 | C_ST REG7,UNIT(7)(a0) |
774 | PTR_ADDIU a0,a0,UNIT(8) |
775 | /* |
776 | * Here we have less than 32(64) bytes to copy. Set up for a loop to |
777 | * copy one word (or double word) at a time. |
778 | */ |
779 | L(ua_chk1w): |
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past the (d)word chunks */
781 | beq a2,t8,L(ua_smallCopy) |
	PTR_SUBU a3,t8,a2	/* a3 is the count of bytes in whole (d)word chunks */
783 | PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */ |
784 | |
785 | /* copying in words (4-byte or 8-byte chunks) */ |
786 | L(ua_wordCopy_loop): |
787 | C_LDHI v1,UNIT(0)(a1) |
788 | C_LDLO v1,UNITM1(1)(a1) |
789 | PTR_ADDIU a0,a0,UNIT(1) |
790 | PTR_ADDIU a1,a1,UNIT(1) |
791 | bne a0,a3,L(ua_wordCopy_loop) |
792 | C_ST v1,UNIT(-1)(a0) |
793 | |
794 | /* Copy the last 8 (or 16) bytes */ |
795 | L(ua_smallCopy): |
796 | beqz a2,L(leave) |
797 | PTR_ADDU a3,a0,a2 /* a3 is the last dst address */ |
798 | L(ua_smallCopy_loop): |
799 | lb v1,0(a1) |
800 | PTR_ADDIU a0,a0,1 |
801 | PTR_ADDIU a1,a1,1 |
802 | bne a0,a3,L(ua_smallCopy_loop) |
803 | sb v1,-1(a0) |
804 | |
805 | j ra |
806 | nop |
807 | |
808 | #else /* R6_CODE */ |
809 | |
810 | # ifdef __MIPSEB |
811 | # define SWAP_REGS(X,Y) X, Y |
812 | # define ALIGN_OFFSET(N) (N) |
813 | # else |
814 | # define SWAP_REGS(X,Y) Y, X |
815 | # define ALIGN_OFFSET(N) (NSIZE-N) |
816 | # endif |
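/* In the macro below, C_ALIGN (align/dalign) combines two adjacent aligned
   (d)words from the source so that REG3 ends up holding the unaligned
   (d)word that starts BYTEOFFSET bytes into the first of them; SWAP_REGS
   and ALIGN_OFFSET adjust the operand order and byte position for
   endianness.  */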
817 | # define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \ |
	andi	REG7, a2, (NSIZE-1);/* REG7 is # of bytes to copy byte by byte. */ \
	beq	REG7, a2, L(lastb); /* If no whole (d)words to copy, go to L(lastb). */ \
820 | PTR_SUBU a3, a2, REG7; /* a3 is number of bytes to be copied in */ \ |
821 | /* (d)word chunks. */ \ |
822 | move a2, REG7; /* a2 is # of bytes to copy byte by byte */ \ |
823 | /* after word loop is finished. */ \ |
824 | PTR_ADDU REG6, a0, a3; /* REG6 is the dst address after loop. */ \ |
825 | PTR_SUBU REG2, a1, t8; /* REG2 is the aligned src address. */ \ |
826 | PTR_ADDU a1, a1, a3; /* a1 is addr of source after word loop. */ \ |
827 | C_LD t0, UNIT(0)(REG2); /* Load first part of source. */ \ |
828 | L(r6_ua_wordcopy##BYTEOFFSET): \ |
829 | C_LD t1, UNIT(1)(REG2); /* Load second part of source. */ \ |
830 | C_ALIGN REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET); \ |
831 | PTR_ADDIU a0, a0, UNIT(1); /* Increment destination pointer. */ \ |
832 | PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.*/ \ |
833 | move t0, t1; /* Move second part of source to first. */ \ |
834 | bne a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET); \ |
835 | C_ST REG3, UNIT(-1)(a0); \ |
836 | j L(lastb); \ |
837 | nop |
838 | |
/* We are generating R6 code; the destination is (d)word aligned and the
   source is not.  t8 is 1 through NSIZE-1, depending on the alignment of
   the source.  */
842 | |
843 | L(r6_unaligned1): |
844 | R6_UNALIGNED_WORD_COPY(1) |
845 | L(r6_unaligned2): |
846 | R6_UNALIGNED_WORD_COPY(2) |
847 | L(r6_unaligned3): |
848 | R6_UNALIGNED_WORD_COPY(3) |
849 | # ifdef USE_DOUBLE |
850 | L(r6_unaligned4): |
851 | R6_UNALIGNED_WORD_COPY(4) |
852 | L(r6_unaligned5): |
853 | R6_UNALIGNED_WORD_COPY(5) |
854 | L(r6_unaligned6): |
855 | R6_UNALIGNED_WORD_COPY(6) |
856 | L(r6_unaligned7): |
857 | R6_UNALIGNED_WORD_COPY(7) |
858 | # endif |
859 | #endif /* R6_CODE */ |
860 | |
861 | .set at |
862 | .set reorder |
863 | END(MEMCPY_NAME) |
864 | #ifndef ANDROID_CHANGES |
865 | # ifdef _LIBC |
866 | libc_hidden_builtin_def (MEMCPY_NAME) |
867 | # endif |
868 | #endif |
869 | |