/* Optimized memmove implementation for PowerPC64/POWER7.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])

   This implementation checks whether memory 'dest' overlaps with 'src'.
   If it does not, an optimized memcpy is used (similar to memcpy for
   POWER7, embedded here to gain some cycles).
   If source and destination overlap, an optimized backwards memcpy is
   used instead.  */
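
/* A rough C sketch of the overlap check described above (the helper names
   forward_copy/backward_copy and memmove_sketch are hypothetical, not part
   of this file): computing DST - SRC as an unsigned value and comparing it
   against LEN tells whether a forward copy could clobber source bytes that
   have not been read yet.

     void *memmove_sketch (void *dest, const void *src, size_t len)
     {
       if ((uintptr_t) dest - (uintptr_t) src >= len)
         return forward_copy (dest, src, len);   // memcpy-like path
       return backward_copy (dest, src, len);    // copies high to low
     }
 */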

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
	.machine power7
ENTRY_TOCLESS (MEMMOVE, 5)
	CALL_MCOUNT 3

L(_memmove):
	subf	r9,r4,r3
	cmpld	cr7,r9,r5
	blt	cr7,L(memmove_bwd)

	cmpldi	cr1,r5,31
	neg	0,3
	ble	cr1, L(copy_LT_32)	/* If move < 32 bytes use short move
					   code.  */

	andi.	10,3,15
	clrldi	11,4,60
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	r11,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
	clrldi	0,0,60

/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
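/* A rough C sketch of this prologue, under the assumption that the mtocrf
   above loads the low bits of -DST (computed by 'neg 0,3') into the CR
   field tested by the bf 31/30/29/28 below.  align_dst is a hypothetical
   illustration-only helper; the memcpy calls stand for the fixed-size
   lhz/sth, lwz/stw and ld/std pairs:

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     static void align_dst (unsigned char **dp, const unsigned char **sp,
                            size_t *lenp)
     {
       unsigned char *d = *dp;  const unsigned char *s = *sp;
       size_t pad = (-(uintptr_t) d) & 15;                 // neg + clrldi
       if (pad & 1) { *d = *s; d += 1; s += 1; }           // bf 31: lbz/stb
       if (pad & 2) { memcpy (d, s, 2); d += 2; s += 2; }  // bf 30: lhz/sth
       if (pad & 4) { memcpy (d, s, 4); d += 4; s += 4; }  // bf 29: lwz/stw
       if (pad & 8) { memcpy (d, s, 8); d += 8; s += 8; }  // bf 28: ld/std
       *lenp -= pad;  *dp = d;  *sp = s;                   // subf r5,0,r5
     }
 */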
1:
	bf	31,2f
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1
2:
	bf	30,4f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
4:
	bf	29,8f
	lwz	6,0(r4)
	addi	r4,r4,4
	stw	6,0(r11)
	addi	r11,r11,4
8:
	bf	28,16f
	ld	6,0(r4)
	addi	r4,r4,8
	std	6,0(r11)
	addi	r11,r11,8
16:
	subf	r5,0,r5

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
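/* A rough C sketch of the loop and tail below; the memcpy calls are
   illustration-only stand-ins for the lvx/stvx (and ld/std, lwz/stw, ...)
   sequences, and the function name is hypothetical:

     static void aligned_copy_sketch (unsigned char *d,
                                      const unsigned char *s, size_t len)
     {
       for (size_t i = len >> 7; i > 0; i--)    // srdi 12,r5,7; mtctr 12
         {
           // Eight lvx/stvx pairs per iteration in the assembly, software
           // pipelined across L(aligned_128head)/L(aligned_128loop).
           memcpy (d, s, 128);  d += 128;  s += 128;
         }
       if (len & 64) { memcpy (d, s, 64); d += 64; s += 64; }  // bf 25
       if (len & 32) { memcpy (d, s, 32); d += 32; s += 32; }  // bf 26
       if (len & 16) { memcpy (d, s, 16); d += 16; s += 16; }  // bf 27
       if (len & 8)  { memcpy (d, s, 8);  d += 8;  s += 8;  }  // bf 28
       if (len & 4)  { memcpy (d, s, 4);  d += 4;  s += 4;  }  // bf 29
       if (len & 2)  { memcpy (d, s, 2);  d += 2;  s += 2;  }  // bf 30
       if (len & 1)  { *d = *s; }                              // bf 31
     }
 */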
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
	mtocrf	0x02,r5
	srdi	12,r5,7
	cmpdi	12,0
	beq	L(aligned_tail)
	lvx	6,0,r4
	lvx	7,r4,6
	mtctr	12
	b	L(aligned_128loop)

	.align	4
L(aligned_128head):
	/* for the 2nd + iteration of this loop.  */
	lvx	6,0,r4
	lvx	7,r4,6
L(aligned_128loop):
	lvx	8,r4,7
	lvx	9,r4,8
	stvx	6,0,r11
	addi	r4,r4,64
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	lvx	6,0,r4
	lvx	7,r4,6
	addi	r11,r11,64
	lvx	8,r4,7
	lvx	9,r4,8
	addi	r4,r4,64
	stvx	6,0,r11
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	addi	r11,r11,64
	bdnz	L(aligned_128head)

L(aligned_tail):
	mtocrf	0x01,r5
	bf	25,32f
	lvx	6,0,r4
	lvx	7,r4,6
	lvx	8,r4,7
	lvx	9,r4,8
	addi	r4,r4,64
	stvx	6,0,r11
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	addi	r11,r11,64
32:
	bf	26,16f
	lvx	6,0,r4
	lvx	7,r4,6
	addi	r4,r4,32
	stvx	6,0,r11
	stvx	7,r11,6
	addi	r11,r11,32
16:
	bf	27,8f
	lvx	6,0,r4
	addi	r4,r4,16
	stvx	6,0,r11
	addi	r11,r11,16
8:
	bf	28,4f
	ld	6,0(r4)
	addi	r4,r4,8
	std	6,0(r11)
	addi	r11,r11,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw	6,0(r11)
	bf	30,L(tail5)
	lhz	7,4(r4)
	sth	7,4(r11)
	bflr	31
	lbz	8,6(r4)
	stb	8,6(r11)
	/* Return original DST pointer.  */
	blr

/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	mr	r11,3
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3
	cmpldi	cr1,r5,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	r5,0,r5
2:
	bf	30,1f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(r4)
	lwz	7,4(r4)
	stw	6,0(r11)
	lwz	8,8(r4)
	stw	7,4(r11)
	lwz	6,12(r4)
	addi	r4,r4,16
	stw	8,8(r11)
	stw	6,12(r11)
	addi	r11,r11,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(r4)
	lwz	7,4(r4)
	addi	r4,r4,8
	stw	6,0(r11)
	stw	7,4(r11)
	addi	r11,r11,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw	6,0(r11)
	bf	30,L(tail5)
	lhz	7,4(r4)
	sth	7,4(r11)
	bflr	31
	lbz	8,6(r4)
	stb	8,6(r11)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(r4)
	sth	6,0(r11)
	bflr	31
	lbz	7,2(r4)
	stb	7,2(r11)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(r4)
	stb	6,4(r11)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(r4)
	stb	6,0(r11)
	/* Return original DST pointer.  */
	blr

/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,L(tail4)

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(r4)
	lwz	7,4(r4)
	stw	6,0(r11)
	stw	7,4(r11)
	blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
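
/* A sketch of the realignment idiom used below, written with the AltiVec
   intrinsics from <altivec.h>.  This is the big-endian variant (the
   little-endian path uses lvsr and swaps the vec_perm inputs), the function
   name is hypothetical, and the end-of-buffer/tail handling done by the
   assembly is omitted:

     #include <altivec.h>
     #include <stddef.h>

     static void unaligned_fwd_sketch (unsigned char *dst,       // 16-byte aligned
                                       const unsigned char *src, // unaligned
                                       size_t nquads)
     {
       vector unsigned char mask = vec_lvsl (0, src);   // permute control
       vector unsigned char prev = vec_ld (0, src);     // aligned quad at src
       for (size_t i = 0; i < nquads; i++)
         {
           vector unsigned char next = vec_ld (16, src); // next aligned quad
           vec_st (vec_perm (prev, next, mask), 0, dst); // realigned 16 bytes
           prev = next;
           src += 16;  dst += 16;
         }
     }
 */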
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60		/* Number of bytes until the 1st r11 quadword.  */
	srdi	9,r5,4		/* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	r5,0,r5

	/* Vector instructions work best when proper alignment (16-bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1
2:
	bf	30,4f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
4:
	bf	29,8f
	lwz	6,0(r4)
	addi	r4,r4,4
	stw	6,0(r11)
	addi	r11,r11,4
8:
	bf	28,0f
	ld	6,0(r4)
	addi	r4,r4,8
	std	6,0(r11)
	addi	r11,r11,8
0:
	srdi	9,r5,4		/* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrldi	10,r5,60
	li	6,16		/* Index for 16-bytes offsets.  */
	li	7,32		/* Index for 32-bytes offsets.  */
	cmpldi	cr1,10,0
	srdi	8,r5,5		/* Setup the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,r4
#else
	lvsl	5,0,r4
#endif
	lvx	3,0,r4
	li	0,0
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
	lvx	4,r4,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	r4,r4,16
	stvx	6,0,r11
	addi	r11,r11,16
	vor	3,4,4
	clrrdi	0,r4,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is faster than having unaligned
	   vector instructions though.  */

	lvx	4,r4,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,r4,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	r4,r4,32
	stvx	6,0,r11
	stvx	10,r11,6
	addi	r11,r11,32
	bdnz	L(unaligned_loop)

	clrrdi	0,r4,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,r5
	beqlr	cr1

	add	r4,r4,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(r4)
	lwz	7,4(r4)
	addi	r4,r4,8
	stw	6,0(r11)
	stw	7,4(r11)
	addi	r11,r11,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw	6,0(r11)
	bf	30,L(tail5)
	lhz	7,4(r4)
	sth	7,4(r11)
	bflr	31
	lbz	8,6(r4)
	stb	8,6(r11)
	/* Return original DST pointer.  */
	blr

	/* Start of the backward memcpy implementation: the algorithm first
	   checks whether src and dest share the same alignment.  If they do,
	   both are aligned to 16 bytes and the copy is done with VSX
	   instructions.  If they do not, dest is aligned to 16 bytes and VMX
	   (Altivec) instructions read two 16-byte blocks at a time; the bytes
	   read are shifted/permuted and written aligned to dest.  */
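/* A rough C sketch of the backward entry point below (the function name is
   hypothetical): both pointers are first moved to the end of their buffers
   and the copy then walks downwards, so later source bytes are read before
   they can be overwritten.

     static void *memmove_bwd_sketch (void *dest, const void *src, size_t len)
     {
       unsigned char *d = (unsigned char *) dest + len;            // add r11,r3,r5
       const unsigned char *s = (const unsigned char *) src + len; // add r4,r4,r5
       while (len--)
         *--d = *--s;   // done in 16/8/4/2/1-byte steps by the code below
       return dest;     // r3 is preserved and returned unchanged
     }
 */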
L(memmove_bwd):
	cmpldi	cr1,r5,31
	/* Copy is done backwards: update the pointers and check alignment.  */
	add	r11,r3,r5
	add	r4,r4,r5
	mr	r0,r11
	ble	cr1, L(copy_LT_32_bwd)	/* If move < 32 bytes use short move
					   code.  */

	andi.	r10,r11,15	/* Check if r11 is aligned to 16 bytes.  */
	clrldi	r9,r4,60	/* Check if r4 is aligned to 16 bytes.  */
	cmpld	cr6,r10,r9	/* SRC and DST alignments match?  */

	bne	cr6,L(copy_GE_32_unaligned_bwd)
	beq	L(aligned_copy_bwd)

	mtocrf	0x01,r0
	clrldi	r0,r0,60

/* Get the DST and SRC aligned to 16 bytes.  */
1:
	bf	31,2f
	lbz	r6,-1(r4)
	subi	r4,r4,1
	stb	r6,-1(r11)
	subi	r11,r11,1
2:
	bf	30,4f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
4:
	bf	29,8f
	lwz	r6,-4(r4)
	subi	r4,r4,4
	stw	r6,-4(r11)
	subi	r11,r11,4
8:
	bf	28,16f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std	r6,-8(r11)
	subi	r11,r11,8
16:
	subf	r5,0,r5

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
L(aligned_copy_bwd):
	li	r6,-16
	li	r7,-32
	li	r8,-48
	li	r9,-64
	mtocrf	0x02,r5
	srdi	r12,r5,7
	cmpdi	r12,0
	beq	L(aligned_tail_bwd)
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	mtctr	r12
	b	L(aligned_128loop_bwd)

	.align	4
L(aligned_128head_bwd):
	/* for the 2nd + iteration of this loop.  */
	lvx	v6,r4,r6
	lvx	v7,r4,r7
L(aligned_128loop_bwd):
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	stvx	v6,r11,r6
	subi	r4,r4,64
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	subi	r11,r11,64
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	subi	r4,r4,64
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	subi	r11,r11,64
	bdnz	L(aligned_128head_bwd)

L(aligned_tail_bwd):
	mtocrf	0x01,r5
	bf	25,32f
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	subi	r4,r4,64
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	subi	r11,r11,64
32:
	bf	26,16f
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	subi	r4,r4,32
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	subi	r11,r11,32
16:
	bf	27,8f
	lvx	v6,r4,r6
	subi	r4,r4,16
	stvx	v6,r11,r6
	subi	r11,r11,16
8:
	bf	28,4f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std	r6,-8(r11)
	subi	r11,r11,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2_bwd)
	lwz	r6,-4(r4)
	stw	r6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	r7,-6(r4)
	sth	r7,-6(r11)
	bflr	31
	lbz	r8,-7(r4)
	stb	r8,-7(r11)
	/* Return original DST pointer.  */
	blr

/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32_bwd):
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(copy_LE_8_bwd)

	/* At least 9 bytes to go.  */
	neg	r8,r4
	andi.	r0,r8,3
	cmpldi	cr1,r5,16
	beq	L(copy_LT_32_aligned_bwd)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	r5,0,r5
2:
	bf	30,1f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
1:
	bf	31,L(end_4bytes_alignment_bwd)
	lbz	6,-1(r4)
	subi	r4,r4,1
	stb	6,-1(r11)
	subi	r11,r11,1

	.align	4
L(end_4bytes_alignment_bwd):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(copy_LT_32_aligned_bwd):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	stw	r6,-4(r11)
	lwz	r8,-12(r4)
	stw	r7,-8(r11)
	lwz	r6,-16(r4)
	subi	r4,r4,16
	stw	r8,-12(r11)
	stw	r6,-16(r11)
	subi	r11,r11,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4_bwd)
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	subi	r4,r4,8
	stw	r6,-4(r11)
	stw	r7,-8(r11)
	subi	r11,r11,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4_bwd):
	bf	29,L(tail2_bwd)
	lwz	6,-4(r4)
	stw	6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	7,-6(r4)
	sth	7,-6(r11)
	bflr	31
	lbz	8,-7(r4)
	stb	8,-7(r11)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2_bwd):
	bf	30,1f
	lhz	6,-2(r4)
	sth	6,-2(r11)
	bflr	31
	lbz	7,-3(r4)
	stb	7,-3(r11)
	blr

	.align	4
L(tail5_bwd):
	bflr	31
	lbz	6,-5(r4)
	stb	6,-5(r11)
	blr

	.align	4
1:
	bflr	31
	lbz	6,-1(r4)
	stb	6,-1(r11)
	/* Return original DST pointer.  */
	blr


/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8_bwd):
	bne	cr6,L(tail4_bwd)

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */
	lwz	6,-8(r4)
	lwz	7,-4(r4)
	stw	6,-8(r11)
	stw	7,-4(r11)
	blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned_bwd):
	andi.	r10,r11,15	/* Check alignment of DST against 16 bytes.  */
	srdi	r9,r5,4		/* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont_bwd)

	/* DST is not quadword aligned and r10 holds the address masked to
	   compare alignments.  */
	mtocrf	0x01,r10
	subf	r5,r10,r5

	/* Vector instructions work best when proper alignment (16-bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	r6,-1(r4)
	subi	r4,r4,1
	stb	r6,-1(r11)
	subi	r11,r11,1
2:
	bf	30,4f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
4:
	bf	29,8f
	lwz	r6,-4(r4)
	subi	r4,r4,4
	stw	r6,-4(r11)
	subi	r11,r11,4
8:
	bf	28,0f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std	r6,-8(r11)
	subi	r11,r11,8
0:
	srdi	r9,r5,4		/* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont_bwd):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrldi	r10,r5,60
	li	r6,-16		/* Index for 16-bytes offsets.  */
	li	r7,-32		/* Index for 32-bytes offsets.  */
	cmpldi	cr1,r10,0
	srdi	r8,r5,5		/* Setup the loop counter.  */
	mtocrf	0x01,r9
	cmpldi	cr6,r9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	v5,r0,r4
#else
	lvsl	v5,r0,r4
#endif
	lvx	v3,0,r4
	li	r0,0
	bf	31,L(setup_unaligned_loop_bwd)

	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
	lvx	v4,r4,r6
#ifdef __LITTLE_ENDIAN__
	vperm	v6,v3,v4,v5
#else
	vperm	v6,v4,v3,v5
#endif
	subi	r4,r4,16
	stvx	v6,r11,r6
	subi	r11,r11,16
	vor	v3,v4,v4
	clrrdi	r0,r4,60

L(setup_unaligned_loop_bwd):
	mtctr	r8
	ble	cr6,L(end_unaligned_loop_bwd)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop_bwd):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is faster than having unaligned
	   vector instructions though.  */

	lvx	v4,r4,r6
#ifdef __LITTLE_ENDIAN__
	vperm	v6,v3,v4,v5
#else
	vperm	v6,v4,v3,v5
#endif
	lvx	v3,r4,r7
#ifdef __LITTLE_ENDIAN__
	vperm	v10,v4,v3,v5
#else
	vperm	v10,v3,v4,v5
#endif
	subi	r4,r4,32
	stvx	v6,r11,r6
	stvx	v10,r11,r7
	subi	r11,r11,32
	bdnz	L(unaligned_loop_bwd)

	clrrdi	r0,r4,60

	.align	4
L(end_unaligned_loop_bwd):

	/* Check for tail bytes.  */
	mtocrf	0x01,r5
	beqlr	cr1

	add	r4,r4,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	subi	r4,r4,8
	stw	r6,-4(r11)
	stw	r7,-8(r11)
	subi	r11,r11,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2_bwd)
	lwz	r6,-4(r4)
	stw	r6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	r7,-6(r4)
	sth	r7,-6(r11)
	bflr	31
	lbz	r8,-7(r4)
	stb	r8,-7(r11)
	/* Return original DST pointer.  */
	blr
END_GEN_TB (MEMMOVE, TB_TOCLESS)
libc_hidden_builtin_def (memmove)
