1 | /* Optimized memmove implementation for PowerPC64/POWER7. |
2 | Copyright (C) 2014-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | |
/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])

   This implementation checks whether 'dest' overlaps with 'src'.  If it
   does not, an optimized forward memcpy is used (similar to the POWER7
   memcpy, embedded here to save some cycles).
   If the source and destination overlap, an optimized backwards copy is
   used instead.  */
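
/* For illustration only: the dispatch below is equivalent to this C
   sketch (not part of the build; copy_fwd/copy_bwd are hypothetical
   names for the two code paths in this file).  Because the pointer
   difference and the comparison are unsigned, a single compare covers
   both the no-overlap case and the DEST-below-SRC case, where a
   forward copy is also safe:

     void *memmove_sketch (void *dest, const void *src, size_t len)
     {
       if ((uintptr_t) dest - (uintptr_t) src >= len)
         return copy_fwd (dest, src, len);  // forward copy is safe
       return copy_bwd (dest, src, len);    // copy from the end down
     }
*/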
29 | |
30 | #ifndef MEMMOVE |
31 | # define MEMMOVE memmove |
32 | #endif |
33 | .machine power7 |
34 | ENTRY_TOCLESS (MEMMOVE, 5) |
35 | CALL_MCOUNT 3 |
36 | |
37 | L(_memmove): |
38 | subf r9,r4,r3 |
39 | cmpld cr7,r9,r5 |
40 | blt cr7,L(memmove_bwd) |
41 | |
42 | cmpldi cr1,r5,31 |
43 | neg 0,3 |
44 | ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move |
45 | code. */ |
46 | |
47 | andi. 10,3,15 |
48 | clrldi 11,4,60 |
49 | cmpld cr6,10,11 /* SRC and DST alignments match? */ |
50 | |
51 | mr r11,3 |
52 | bne cr6,L(copy_GE_32_unaligned) |
53 | beq L(aligned_copy) |
54 | |
55 | mtocrf 0x01,0 |
56 | clrldi 0,0,60 |
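
/* For illustration only (C sketch, not part of the build; dst/src are
   byte-pointer mirrors of r11/r4): r0 now holds (-DST) & 0xF, the
   number of bytes up to the next 16-byte boundary, and mtocrf has
   copied its low four bits into CR7, so the bf 31/30/29/28 branches
   below skip or take the 1-, 2-, 4- and 8-byte steps:

     size_t pad = (-(uintptr_t) dst) & 15;         // mirrors r0
     if (pad & 1) { *dst++ = *src++; }
     if (pad & 2) { memcpy (dst, src, 2); dst += 2; src += 2; }
     if (pad & 4) { memcpy (dst, src, 4); dst += 4; src += 4; }
     if (pad & 8) { memcpy (dst, src, 8); dst += 8; src += 8; }
     len -= pad;                                   // subf at label 16
*/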
57 | |
	/* Get the DST and SRC aligned to 16 bytes.  */
59 | 1: |
60 | bf 31,2f |
61 | lbz 6,0(r4) |
62 | addi r4,r4,1 |
63 | stb 6,0(r11) |
64 | addi r11,r11,1 |
65 | 2: |
66 | bf 30,4f |
67 | lhz 6,0(r4) |
68 | addi r4,r4,2 |
69 | sth 6,0(r11) |
70 | addi r11,r11,2 |
71 | 4: |
72 | bf 29,8f |
73 | lwz 6,0(r4) |
74 | addi r4,r4,4 |
75 | stw 6,0(r11) |
76 | addi r11,r11,4 |
77 | 8: |
78 | bf 28,16f |
79 | ld 6,0(r4) |
80 | addi r4,r4,8 |
81 | std 6,0(r11) |
82 | addi r11,r11,8 |
83 | 16: |
84 | subf r5,0,r5 |
85 | |
86 | /* Main aligned copy loop. Copies 128 bytes at a time. */ |
87 | L(aligned_copy): |
88 | li 6,16 |
89 | li 7,32 |
90 | li 8,48 |
91 | mtocrf 0x02,r5 |
92 | srdi 12,r5,7 |
93 | cmpdi 12,0 |
94 | beq L(aligned_tail) |
95 | lvx 6,0,r4 |
96 | lvx 7,r4,6 |
97 | mtctr 12 |
98 | b L(aligned_128loop) |
99 | |
100 | .align 4 |
101 | L(aligned_128head): |
	/* For the 2nd and subsequent iterations of this loop.  */
103 | lvx 6,0,r4 |
104 | lvx 7,r4,6 |
105 | L(aligned_128loop): |
106 | lvx 8,r4,7 |
107 | lvx 9,r4,8 |
108 | stvx 6,0,r11 |
109 | addi r4,r4,64 |
110 | stvx 7,r11,6 |
111 | stvx 8,r11,7 |
112 | stvx 9,r11,8 |
113 | lvx 6,0,r4 |
114 | lvx 7,r4,6 |
115 | addi r11,r11,64 |
116 | lvx 8,r4,7 |
117 | lvx 9,r4,8 |
118 | addi r4,r4,64 |
119 | stvx 6,0,r11 |
120 | stvx 7,r11,6 |
121 | stvx 8,r11,7 |
122 | stvx 9,r11,8 |
123 | addi r11,r11,64 |
124 | bdnz L(aligned_128head) |
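
/* The 128-byte loop above is software-pipelined: the loads for the
   next iteration are issued at L(aligned_128head) before the stores
   of the current one, and r6/r7/r8 hold the constant indexes 16/32/48
   so lvx/stvx can address four quadwords per group with no extra
   pointer updates.  Rough C sketch (not part of the build; 'vec'
   stands for a 16-byte vector type such as vector unsigned char):

     vec *vs = (vec *) src, *vd = (vec *) dst;
     for (size_t i = len >> 7; i > 0; i--)    // 128 bytes/iteration
       {
         vec a = vs[0], b = vs[1], c = vs[2], d = vs[3];
         vd[0] = a; vd[1] = b; vd[2] = c; vd[3] = d;
         vec e = vs[4], f = vs[5], g = vs[6], h = vs[7];
         vd[4] = e; vd[5] = f; vd[6] = g; vd[7] = h;
         vs += 8; vd += 8;
       }

   The tail below tests CR bits set from len: bit 25 selects a 64-byte
   chunk, bit 26 a 32-byte chunk, bit 27 a 16-byte chunk, and bits
   28..31 the final 8/4/2/1 bytes.  */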
125 | |
126 | L(aligned_tail): |
127 | mtocrf 0x01,r5 |
128 | bf 25,32f |
129 | lvx 6,0,r4 |
130 | lvx 7,r4,6 |
131 | lvx 8,r4,7 |
132 | lvx 9,r4,8 |
133 | addi r4,r4,64 |
134 | stvx 6,0,r11 |
135 | stvx 7,r11,6 |
136 | stvx 8,r11,7 |
137 | stvx 9,r11,8 |
138 | addi r11,r11,64 |
139 | 32: |
140 | bf 26,16f |
141 | lvx 6,0,r4 |
142 | lvx 7,r4,6 |
143 | addi r4,r4,32 |
144 | stvx 6,0,r11 |
145 | stvx 7,r11,6 |
146 | addi r11,r11,32 |
147 | 16: |
148 | bf 27,8f |
149 | lvx 6,0,r4 |
150 | addi r4,r4,16 |
151 | stvx 6,0,r11 |
152 | addi r11,r11,16 |
153 | 8: |
154 | bf 28,4f |
155 | ld 6,0(r4) |
156 | addi r4,r4,8 |
157 | std 6,0(r11) |
158 | addi r11,r11,8 |
159 | 4: /* Copies 4~7 bytes. */ |
160 | bf 29,L(tail2) |
161 | lwz 6,0(r4) |
162 | stw 6,0(r11) |
163 | bf 30,L(tail5) |
164 | lhz 7,4(r4) |
165 | sth 7,4(r11) |
166 | bflr 31 |
167 | lbz 8,6(r4) |
168 | stb 8,6(r11) |
169 | /* Return original DST pointer. */ |
170 | blr |
171 | |
172 | /* Handle copies of 0~31 bytes. */ |
173 | .align 4 |
174 | L(copy_LT_32): |
175 | mr r11,3 |
176 | cmpldi cr6,r5,8 |
177 | mtocrf 0x01,r5 |
178 | ble cr6,L(copy_LE_8) |
179 | |
180 | /* At least 9 bytes to go. */ |
181 | neg 8,4 |
182 | andi. 0,8,3 |
183 | cmpldi cr1,r5,16 |
184 | beq L(copy_LT_32_aligned) |
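
/* The 9~31-byte path stays in GPRs: SRC is forced to 4-byte alignment
   below, then the copy proceeds in 4-byte words (DST may remain
   misaligned; the word stores tolerate that).  Rough C sketch,
   illustration only:

     size_t pad = (-(uintptr_t) src) & 3;   // 0..3 bytes, len >= 9
     for (size_t i = 0; i < pad; i++)
       *dst++ = *src++;
     len -= pad;
     // ... then 16/8/4/2/1-byte steps selected by the bits of len.
*/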
185 | |
186 | /* Force 4-byte alignment for SRC. */ |
187 | mtocrf 0x01,0 |
188 | subf r5,0,r5 |
189 | 2: |
190 | bf 30,1f |
191 | lhz 6,0(r4) |
192 | addi r4,r4,2 |
193 | sth 6,0(r11) |
194 | addi r11,r11,2 |
195 | 1: |
196 | bf 31,L(end_4bytes_alignment) |
197 | lbz 6,0(r4) |
198 | addi r4,r4,1 |
199 | stb 6,0(r11) |
200 | addi r11,r11,1 |
201 | |
202 | .align 4 |
203 | L(end_4bytes_alignment): |
204 | cmpldi cr1,r5,16 |
205 | mtocrf 0x01,r5 |
206 | |
207 | L(copy_LT_32_aligned): |
208 | /* At least 6 bytes to go, and SRC is word-aligned. */ |
209 | blt cr1,8f |
210 | |
211 | /* Copy 16 bytes. */ |
212 | lwz 6,0(r4) |
213 | lwz 7,4(r4) |
214 | stw 6,0(r11) |
215 | lwz 8,8(r4) |
216 | stw 7,4(r11) |
217 | lwz 6,12(r4) |
218 | addi r4,r4,16 |
219 | stw 8,8(r11) |
220 | stw 6,12(r11) |
221 | addi r11,r11,16 |
222 | 8: /* Copy 8 bytes. */ |
223 | bf 28,L(tail4) |
224 | lwz 6,0(r4) |
225 | lwz 7,4(r4) |
226 | addi r4,r4,8 |
227 | stw 6,0(r11) |
228 | stw 7,4(r11) |
229 | addi r11,r11,8 |
230 | |
231 | .align 4 |
232 | /* Copies 4~7 bytes. */ |
233 | L(tail4): |
234 | bf 29,L(tail2) |
235 | lwz 6,0(r4) |
236 | stw 6,0(r11) |
237 | bf 30,L(tail5) |
238 | lhz 7,4(r4) |
239 | sth 7,4(r11) |
240 | bflr 31 |
241 | lbz 8,6(r4) |
242 | stb 8,6(r11) |
243 | /* Return original DST pointer. */ |
244 | blr |
245 | |
246 | .align 4 |
247 | /* Copies 2~3 bytes. */ |
248 | L(tail2): |
249 | bf 30,1f |
250 | lhz 6,0(r4) |
251 | sth 6,0(r11) |
252 | bflr 31 |
253 | lbz 7,2(r4) |
254 | stb 7,2(r11) |
255 | blr |
256 | |
257 | .align 4 |
258 | L(tail5): |
259 | bflr 31 |
260 | lbz 6,4(r4) |
261 | stb 6,4(r11) |
262 | blr |
263 | |
264 | .align 4 |
265 | 1: |
266 | bflr 31 |
267 | lbz 6,0(r4) |
268 | stb 6,0(r11) |
269 | /* Return original DST pointer. */ |
270 | blr |
271 | |
272 | /* Handles copies of 0~8 bytes. */ |
273 | .align 4 |
274 | L(copy_LE_8): |
275 | bne cr6,L(tail4) |
276 | |
277 | /* Though we could've used ld/std here, they are still |
278 | slow for unaligned cases. */ |
279 | |
280 | lwz 6,0(r4) |
281 | lwz 7,4(r4) |
282 | stw 6,0(r11) |
283 | stw 7,4(r11) |
284 | blr |
285 | |
286 | |
287 | /* Handle copies of 32+ bytes where DST is aligned (to quadword) but |
288 | SRC is not. Use aligned quadword loads from SRC, shifted to realign |
289 | the data, allowing for aligned DST stores. */ |
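
/* For illustration only (Altivec intrinsics sketch, not part of the
   build): lvx ignores the low four address bits, so the loop reads
   the aligned quadwords enclosing the unaligned source data, and
   vperm extracts the 16 wanted bytes using a control vector built by
   lvsl (lvsr with swapped vperm operands on little-endian).
   Big-endian operand order shown:

     vector unsigned char shift = vec_lvsl (0, src);
     vector unsigned char prev  = vec_ld (0, src);   // aligned load
     while (n >= 16)
       {
         vector unsigned char next = vec_ld (16, src);
         vec_st (vec_perm (prev, next, shift), 0, dst);
         prev = next;
         src += 16; dst += 16; n -= 16;
       }
*/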
290 | .align 4 |
291 | L(copy_GE_32_unaligned): |
	clrldi	0,0,60	      /* Number of bytes until DST (r11) is quadword
				 aligned.  */
293 | srdi 9,r5,4 /* Number of full quadwords remaining. */ |
294 | |
295 | beq L(copy_GE_32_unaligned_cont) |
296 | |
	/* DST is not quadword aligned; get it aligned.  */
298 | |
299 | mtocrf 0x01,0 |
300 | subf r5,0,r5 |
301 | |
	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
304 | 1: |
305 | bf 31,2f |
306 | lbz 6,0(r4) |
307 | addi r4,r4,1 |
308 | stb 6,0(r11) |
309 | addi r11,r11,1 |
310 | 2: |
311 | bf 30,4f |
312 | lhz 6,0(r4) |
313 | addi r4,r4,2 |
314 | sth 6,0(r11) |
315 | addi r11,r11,2 |
316 | 4: |
317 | bf 29,8f |
318 | lwz 6,0(r4) |
319 | addi r4,r4,4 |
320 | stw 6,0(r11) |
321 | addi r11,r11,4 |
322 | 8: |
323 | bf 28,0f |
324 | ld 6,0(r4) |
325 | addi r4,r4,8 |
326 | std 6,0(r11) |
327 | addi r11,r11,8 |
328 | 0: |
329 | srdi 9,r5,4 /* Number of full quadwords remaining. */ |
330 | |
	/* The proper alignment is present; it is OK to copy the bytes now.  */
332 | L(copy_GE_32_unaligned_cont): |
333 | |
	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	10,r5,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,10,0
	srdi	8,r5,5	      /* Set up the loop counter.  */
340 | mtocrf 0x01,9 |
341 | cmpldi cr6,9,1 |
342 | #ifdef __LITTLE_ENDIAN__ |
343 | lvsr 5,0,r4 |
344 | #else |
345 | lvsl 5,0,r4 |
346 | #endif |
347 | lvx 3,0,r4 |
348 | li 0,0 |
349 | bf 31,L(setup_unaligned_loop) |
350 | |
	/* The number of remaining quadwords is odd: copy 16 bytes first so
	   the loop below can move 32 bytes per iteration.  */
352 | lvx 4,r4,6 |
353 | #ifdef __LITTLE_ENDIAN__ |
354 | vperm 6,4,3,5 |
355 | #else |
356 | vperm 6,3,4,5 |
357 | #endif |
358 | addi r4,r4,16 |
359 | stvx 6,0,r11 |
360 | addi r11,r11,16 |
361 | vor 3,4,4 |
362 | clrrdi 0,r4,60 |
363 | |
364 | L(setup_unaligned_loop): |
365 | mtctr 8 |
366 | ble cr6,L(end_unaligned_loop) |
367 | |
368 | /* Copy 32 bytes at a time using vector instructions. */ |
369 | .align 4 |
370 | L(unaligned_loop): |
371 | |
	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is still faster than using
	   unaligned vector loads.  */
376 | |
377 | lvx 4,r4,6 |
378 | #ifdef __LITTLE_ENDIAN__ |
379 | vperm 6,4,3,5 |
380 | #else |
381 | vperm 6,3,4,5 |
382 | #endif |
383 | lvx 3,r4,7 |
384 | #ifdef __LITTLE_ENDIAN__ |
385 | vperm 10,3,4,5 |
386 | #else |
387 | vperm 10,4,3,5 |
388 | #endif |
389 | addi r4,r4,32 |
390 | stvx 6,0,r11 |
391 | stvx 10,r11,6 |
392 | addi r11,r11,32 |
393 | bdnz L(unaligned_loop) |
394 | |
395 | clrrdi 0,r4,60 |
396 | |
397 | .align 4 |
398 | L(end_unaligned_loop): |
399 | |
400 | /* Check for tail bytes. */ |
401 | mtocrf 0x01,r5 |
402 | beqlr cr1 |
403 | |
404 | add r4,r4,0 |
405 | |
406 | /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ |
407 | /* Copy 8 bytes. */ |
408 | bf 28,4f |
409 | lwz 6,0(r4) |
410 | lwz 7,4(r4) |
411 | addi r4,r4,8 |
412 | stw 6,0(r11) |
413 | stw 7,4(r11) |
414 | addi r11,r11,8 |
415 | 4: /* Copy 4~7 bytes. */ |
416 | bf 29,L(tail2) |
417 | lwz 6,0(r4) |
418 | stw 6,0(r11) |
419 | bf 30,L(tail5) |
420 | lhz 7,4(r4) |
421 | sth 7,4(r11) |
422 | bflr 31 |
423 | lbz 8,6(r4) |
424 | stb 8,6(r11) |
425 | /* Return original DST pointer. */ |
426 | blr |
427 | |
	/* Start of the backward copy implementation: the algorithm first
	   checks whether src and dest share the same alignment.  If they do,
	   both are aligned to 16 bytes and the copy is done with VSX
	   instructions.
	   If they do not, dest is aligned to 16 bytes and VMX (Altivec)
	   instructions are used to read two 16-byte chunks at a time; the
	   bytes read are shifted/permuted and stored aligned to dest.  */
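
/* A C sketch of the pointer setup used here (illustration only): both
   pointers are moved one past the end and the copy walks down, so the
   overlapping bytes at the top of SRC are read before they are
   overwritten:

     unsigned char *d = (unsigned char *) dest + len;
     const unsigned char *s = (const unsigned char *) src + len;
     while (len--)
       *--d = *--s;   // real code moves 1/2/4/8/16 bytes at a time
*/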
434 | L(memmove_bwd): |
435 | cmpldi cr1,r5,31 |
436 | /* Copy is done backwards: update the pointers and check alignment. */ |
437 | add r11,r3,r5 |
438 | add r4,r4,r5 |
439 | mr r0,r11 |
440 | ble cr1, L(copy_LT_32_bwd) /* If move < 32 bytes use short move |
441 | code. */ |
442 | |
	andi.	r10,r11,15	/* Check if r11 is aligned to 16 bytes.  */
	clrldi	r9,r4,60	/* Check if r4 is aligned to 16 bytes.  */
445 | cmpld cr6,r10,r9 /* SRC and DST alignments match? */ |
446 | |
447 | bne cr6,L(copy_GE_32_unaligned_bwd) |
448 | beq L(aligned_copy_bwd) |
449 | |
450 | mtocrf 0x01,r0 |
451 | clrldi r0,r0,60 |
452 | |
453 | /* Get the DST and SRC aligned to 16 bytes. */ |
454 | 1: |
455 | bf 31,2f |
456 | lbz r6,-1(r4) |
457 | subi r4,r4,1 |
458 | stb r6,-1(r11) |
459 | subi r11,r11,1 |
460 | 2: |
461 | bf 30,4f |
462 | lhz r6,-2(r4) |
463 | subi r4,r4,2 |
464 | sth r6,-2(r11) |
465 | subi r11,r11,2 |
466 | 4: |
467 | bf 29,8f |
468 | lwz r6,-4(r4) |
469 | subi r4,r4,4 |
470 | stw r6,-4(r11) |
471 | subi r11,r11,4 |
472 | 8: |
473 | bf 28,16f |
474 | ld r6,-8(r4) |
475 | subi r4,r4,8 |
476 | std r6,-8(r11) |
477 | subi r11,r11,8 |
478 | 16: |
	subf	r5,r0,r5
480 | |
481 | /* Main aligned copy loop. Copies 128 bytes at a time. */ |
482 | L(aligned_copy_bwd): |
483 | li r6,-16 |
484 | li r7,-32 |
485 | li r8,-48 |
486 | li r9,-64 |
487 | mtocrf 0x02,r5 |
488 | srdi r12,r5,7 |
489 | cmpdi r12,0 |
490 | beq L(aligned_tail_bwd) |
491 | lvx v6,r4,r6 |
492 | lvx v7,r4,r7 |
	mtctr	r12
494 | b L(aligned_128loop_bwd) |
495 | |
496 | .align 4 |
497 | L(aligned_128head_bwd): |
	/* For the 2nd and subsequent iterations of this loop.  */
499 | lvx v6,r4,r6 |
500 | lvx v7,r4,r7 |
501 | L(aligned_128loop_bwd): |
502 | lvx v8,r4,r8 |
503 | lvx v9,r4,r9 |
504 | stvx v6,r11,r6 |
505 | subi r4,r4,64 |
506 | stvx v7,r11,r7 |
507 | stvx v8,r11,r8 |
508 | stvx v9,r11,r9 |
509 | lvx v6,r4,r6 |
	lvx	v7,r4,r7
511 | subi r11,r11,64 |
512 | lvx v8,r4,r8 |
513 | lvx v9,r4,r9 |
514 | subi r4,r4,64 |
515 | stvx v6,r11,r6 |
516 | stvx v7,r11,r7 |
517 | stvx v8,r11,r8 |
518 | stvx v9,r11,r9 |
519 | subi r11,r11,64 |
520 | bdnz L(aligned_128head_bwd) |
521 | |
522 | L(aligned_tail_bwd): |
523 | mtocrf 0x01,r5 |
524 | bf 25,32f |
525 | lvx v6,r4,r6 |
526 | lvx v7,r4,r7 |
527 | lvx v8,r4,r8 |
528 | lvx v9,r4,r9 |
529 | subi r4,r4,64 |
530 | stvx v6,r11,r6 |
531 | stvx v7,r11,r7 |
532 | stvx v8,r11,r8 |
533 | stvx v9,r11,r9 |
534 | subi r11,r11,64 |
535 | 32: |
536 | bf 26,16f |
537 | lvx v6,r4,r6 |
538 | lvx v7,r4,r7 |
539 | subi r4,r4,32 |
540 | stvx v6,r11,r6 |
541 | stvx v7,r11,r7 |
542 | subi r11,r11,32 |
543 | 16: |
544 | bf 27,8f |
545 | lvx v6,r4,r6 |
546 | subi r4,r4,16 |
547 | stvx v6,r11,r6 |
548 | subi r11,r11,16 |
549 | 8: |
550 | bf 28,4f |
551 | ld r6,-8(r4) |
552 | subi r4,r4,8 |
553 | std r6,-8(r11) |
554 | subi r11,r11,8 |
555 | 4: /* Copies 4~7 bytes. */ |
556 | bf 29,L(tail2_bwd) |
557 | lwz r6,-4(r4) |
558 | stw r6,-4(r11) |
559 | bf 30,L(tail5_bwd) |
560 | lhz r7,-6(r4) |
561 | sth r7,-6(r11) |
562 | bflr 31 |
563 | lbz r8,-7(r4) |
564 | stb r8,-7(r11) |
565 | /* Return original DST pointer. */ |
566 | blr |
567 | |
568 | /* Handle copies of 0~31 bytes. */ |
569 | .align 4 |
570 | L(copy_LT_32_bwd): |
571 | cmpldi cr6,r5,8 |
572 | mtocrf 0x01,r5 |
573 | ble cr6,L(copy_LE_8_bwd) |
574 | |
575 | /* At least 9 bytes to go. */ |
576 | neg r8,r4 |
577 | andi. r0,r8,3 |
578 | cmpldi cr1,r5,16 |
579 | beq L(copy_LT_32_aligned_bwd) |
580 | |
581 | /* Force 4-byte alignment for SRC. */ |
	mtocrf	0x01,r0
	subf	r5,r0,r5
584 | 2: |
585 | bf 30,1f |
586 | lhz r6,-2(r4) |
587 | subi r4,r4,2 |
588 | sth r6,-2(r11) |
589 | subi r11,r11,2 |
590 | 1: |
591 | bf 31,L(end_4bytes_alignment_bwd) |
	lbz	r6,-1(r4)
	subi	r4,r4,1
	stb	r6,-1(r11)
595 | subi r11,r11,1 |
596 | |
597 | .align 4 |
598 | L(end_4bytes_alignment_bwd): |
599 | cmpldi cr1,r5,16 |
600 | mtocrf 0x01,r5 |
601 | |
602 | L(copy_LT_32_aligned_bwd): |
603 | /* At least 6 bytes to go, and SRC is word-aligned. */ |
604 | blt cr1,8f |
605 | |
606 | /* Copy 16 bytes. */ |
607 | lwz r6,-4(r4) |
608 | lwz r7,-8(r4) |
609 | stw r6,-4(r11) |
610 | lwz r8,-12(r4) |
611 | stw r7,-8(r11) |
612 | lwz r6,-16(r4) |
613 | subi r4,r4,16 |
614 | stw r8,-12(r11) |
615 | stw r6,-16(r11) |
616 | subi r11,r11,16 |
617 | 8: /* Copy 8 bytes. */ |
618 | bf 28,L(tail4_bwd) |
619 | lwz r6,-4(r4) |
620 | lwz r7,-8(r4) |
621 | subi r4,r4,8 |
622 | stw r6,-4(r11) |
623 | stw r7,-8(r11) |
624 | subi r11,r11,8 |
625 | |
626 | .align 4 |
627 | /* Copies 4~7 bytes. */ |
628 | L(tail4_bwd): |
629 | bf 29,L(tail2_bwd) |
	lwz	r6,-4(r4)
	stw	r6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	r7,-6(r4)
	sth	r7,-6(r11)
	bflr	31
	lbz	r8,-7(r4)
	stb	r8,-7(r11)
638 | /* Return original DST pointer. */ |
639 | blr |
640 | |
641 | .align 4 |
642 | /* Copies 2~3 bytes. */ |
643 | L(tail2_bwd): |
644 | bf 30,1f |
	lhz	r6,-2(r4)
	sth	r6,-2(r11)
	bflr	31
	lbz	r7,-3(r4)
	stb	r7,-3(r11)
650 | blr |
651 | |
652 | .align 4 |
653 | L(tail5_bwd): |
654 | bflr 31 |
	lbz	r6,-5(r4)
	stb	r6,-5(r11)
657 | blr |
658 | |
659 | .align 4 |
660 | 1: |
661 | bflr 31 |
	lbz	r6,-1(r4)
	stb	r6,-1(r11)
664 | /* Return original DST pointer. */ |
665 | blr |
666 | |
667 | |
668 | /* Handles copies of 0~8 bytes. */ |
669 | .align 4 |
670 | L(copy_LE_8_bwd): |
671 | bne cr6,L(tail4_bwd) |
672 | |
673 | /* Though we could've used ld/std here, they are still |
674 | slow for unaligned cases. */ |
	lwz	r6,-8(r4)
	lwz	r7,-4(r4)
	stw	r6,-8(r11)
	stw	r7,-4(r11)
679 | blr |
680 | |
681 | |
682 | /* Handle copies of 32+ bytes where DST is aligned (to quadword) but |
683 | SRC is not. Use aligned quadword loads from SRC, shifted to realign |
684 | the data, allowing for aligned DST stores. */ |
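
/* Same realignment technique as the forward L(copy_GE_32_unaligned)
   path, but walking down: the indexes in r6/r7 are negative (-16,
   -32), lvx reads the quadwords below r4, and the vperm operand order
   is swapped relative to the forward loop because the previously
   loaded quadword is now the higher-addressed one.  Sketch of one
   step (big-endian operand order, illustration only):

     vector unsigned char next = vec_ld (-16, src);  // quadword below
     vec_st (vec_perm (next, prev, shift), -16, dst);
     prev = next;  src -= 16;  dst -= 16;
*/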
685 | .align 4 |
686 | L(copy_GE_32_unaligned_bwd): |
	andi.	r10,r11,15	/* Check if DST is aligned to 16 bytes.  */
688 | srdi r9,r5,4 /* Number of full quadwords remaining. */ |
689 | |
690 | beq L(copy_GE_32_unaligned_cont_bwd) |
691 | |
	/* DST is not quadword aligned: r10 (DST & 0xF) holds the number of
	   bytes above the previous quadword boundary.  */
694 | mtocrf 0x01,r10 |
695 | subf r5,r10,r5 |
696 | |
	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
699 | 1: |
700 | bf 31,2f |
701 | lbz r6,-1(r4) |
702 | subi r4,r4,1 |
703 | stb r6,-1(r11) |
704 | subi r11,r11,1 |
705 | 2: |
706 | bf 30,4f |
707 | lhz r6,-2(r4) |
708 | subi r4,r4,2 |
709 | sth r6,-2(r11) |
710 | subi r11,r11,2 |
711 | 4: |
712 | bf 29,8f |
713 | lwz r6,-4(r4) |
714 | subi r4,r4,4 |
715 | stw r6,-4(r11) |
716 | subi r11,r11,4 |
717 | 8: |
718 | bf 28,0f |
719 | ld r6,-8(r4) |
720 | subi r4,r4,8 |
721 | std r6,-8(r11) |
722 | subi r11,r11,8 |
723 | 0: |
724 | srdi r9,r5,4 /* Number of full quadwords remaining. */ |
725 | |
	/* The proper alignment is present; it is OK to copy the bytes now.  */
727 | L(copy_GE_32_unaligned_cont_bwd): |
728 | |
	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	r10,r5,60
	li	r6,-16	      /* Index for 16-byte offsets.  */
	li	r7,-32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,r10,0
	srdi	r8,r5,5	      /* Set up the loop counter.  */
	mtocrf	0x01,r9
736 | cmpldi cr6,r9,1 |
737 | #ifdef __LITTLE_ENDIAN__ |
738 | lvsr v5,r0,r4 |
739 | #else |
740 | lvsl v5,r0,r4 |
741 | #endif |
742 | lvx v3,0,r4 |
743 | li r0,0 |
744 | bf 31,L(setup_unaligned_loop_bwd) |
745 | |
	/* The number of remaining quadwords is odd: copy 16 bytes first so
	   the loop below can move 32 bytes per iteration.  */
747 | lvx v4,r4,r6 |
748 | #ifdef __LITTLE_ENDIAN__ |
749 | vperm v6,v3,v4,v5 |
750 | #else |
751 | vperm v6,v4,v3,v5 |
752 | #endif |
753 | subi r4,r4,16 |
754 | stvx v6,r11,r6 |
755 | subi r11,r11,16 |
756 | vor v3,v4,v4 |
757 | clrrdi r0,r4,60 |
758 | |
759 | L(setup_unaligned_loop_bwd): |
760 | mtctr r8 |
761 | ble cr6,L(end_unaligned_loop_bwd) |
762 | |
763 | /* Copy 32 bytes at a time using vector instructions. */ |
764 | .align 4 |
765 | L(unaligned_loop_bwd): |
766 | |
	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is still faster than using
	   unaligned vector loads.  */
771 | |
772 | lvx v4,r4,r6 |
773 | #ifdef __LITTLE_ENDIAN__ |
774 | vperm v6,v3,v4,v5 |
775 | #else |
776 | vperm v6,v4,v3,v5 |
777 | #endif |
778 | lvx v3,r4,r7 |
779 | #ifdef __LITTLE_ENDIAN__ |
780 | vperm v10,v4,v3,v5 |
781 | #else |
782 | vperm v10,v3,v4,v5 |
783 | #endif |
784 | subi r4,r4,32 |
785 | stvx v6,r11,r6 |
786 | stvx v10,r11,r7 |
787 | subi r11,r11,32 |
788 | bdnz L(unaligned_loop_bwd) |
789 | |
790 | clrrdi r0,r4,60 |
791 | |
792 | .align 4 |
793 | L(end_unaligned_loop_bwd): |
794 | |
795 | /* Check for tail bytes. */ |
796 | mtocrf 0x01,r5 |
797 | beqlr cr1 |
798 | |
799 | add r4,r4,0 |
800 | |
801 | /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ |
802 | /* Copy 8 bytes. */ |
803 | bf 28,4f |
804 | lwz r6,-4(r4) |
805 | lwz r7,-8(r4) |
806 | subi r4,r4,8 |
807 | stw r6,-4(r11) |
808 | stw r7,-8(r11) |
809 | subi r11,r11,8 |
810 | 4: /* Copy 4~7 bytes. */ |
811 | bf 29,L(tail2_bwd) |
812 | lwz r6,-4(r4) |
813 | stw r6,-4(r11) |
814 | bf 30,L(tail5_bwd) |
815 | lhz r7,-6(r4) |
816 | sth r7,-6(r11) |
817 | bflr 31 |
818 | lbz r8,-7(r4) |
819 | stb r8,-7(r11) |
820 | /* Return original DST pointer. */ |
821 | blr |
822 | END_GEN_TB (MEMMOVE, TB_TOCLESS) |
823 | libc_hidden_builtin_def (memmove) |
824 | |