1/* Optimized memcpy implementation for PowerPC64.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
21/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
22 Returns 'dst'.
23
24 Memcpy handles short copies (< 32-bytes) using a binary move blocks
25 (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled
26 with the appropriate combination of byte and halfword load/stores.
27 There is minimal effort to optimize the alignment of short moves.
28 The 64-bit implementations of POWER3 and POWER4 do a reasonable job
29 of handling unaligned load/stores that do not cross 32-byte boundaries.
30
31 Longer moves (>= 32-bytes) justify the effort to get at least the
32 destination doubleword (8-byte) aligned. Further optimization is
33 possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.
35
36 For POWER6 unaligned loads will take a 20+ cycle hiccup for any
37 L1 cache miss that crosses a 32- or 128-byte boundary. Store
38 is more forgiving and does not take a hiccup until page or
39 segment boundaries. So we require doubleword alignment for
40 the source but may take a risk and only require word alignment
41 for the destination. */
42
#ifndef MEMCPY
# define MEMCPY memcpy
#endif
	.machine	"power6"
/* void *memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns the original dst in r3.  */
ENTRY_TOCLESS (MEMCPY, 7)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31	/* cr1: short (<= 31 byte) move?  */
	neg	0,3		/* r0 = -dst; its low 3 bits = bytes to next DW.  */
	std	3,-16(1)	/* Stash dst below SP; reloaded for the return value.  */
	std	31,-8(1)	/* Stash r31; only the unaligned path (.L6) uses it.  */
	andi.	11,3,7		/* check alignment of dst.  */
	clrldi	0,0,61		/* Number of bytes until the 1st doubleword of dst.  */
	clrldi	10,4,61		/* check alignment of src.  */
	cmpldi	cr6,5,8
	ble-	cr1,.L2		/* If move < 32 bytes use short move code.  */
	mtcrf	0x01,0		/* CR7 bits 29/30/31 = word/half/byte fix-up flags.  */
	cmpld	cr6,10,11	/* cr6: are src and dst equally misaligned?  */
	srdi	9,5,3		/* Number of full double words remaining.  */
	beq	.L0		/* dst already doubleword aligned.  */

	subf	5,0,5		/* Discount the alignment fix-up bytes from len.  */
    /* Move 0-7 bytes as needed to get the destination doubleword aligned.
       Duplicate some code to maximize fall-through and minimize agen delays.  */
1:	bf	31,2f
	lbz	6,0(4)
	stb	6,0(3)
	bf	30,5f
	lhz	6,1(4)
	sth	6,1(3)
	bf	29,0f
	lwz	6,3(4)
	stw	6,3(3)
	b	0f
5:
	bf	29,0f
	lwz	6,1(4)
	stw	6,1(3)
	b	0f

2:	bf	30,4f
	lhz	6,0(4)
	sth	6,0(3)
	bf	29,0f
	lwz	6,2(4)
	stw	6,2(3)
	b	0f

4:	bf	29,0f
	lwz	6,0(4)
	stw	6,0(3)
0:
/* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
	add	4,4,0
	add	3,3,0

	clrldi	10,4,61	/* check alignment of src again.  */
	srdi	9,5,3	/* Number of full double words remaining.  */

    /* Copy doublewords from source to destination, assuming the
       destination is aligned on a doubleword boundary.

       At this point we know there are at least 25 bytes left (32-7) to copy.
       The next step is to determine if the source is also doubleword aligned.
       If not branch to the unaligned move code at .L6, which uses
       a load, shift, store strategy.

       Otherwise source and destination are doubleword aligned, and we can
       use the optimized doubleword copy loop.  */
	.align	4
.L0:
	clrldi	11,5,61		/* r11 = len & 7, the 0-7 byte tail.  */
	andi.	0,5,0x78	/* r0 = len & 0x78, DWs beyond full 128B blocks.  */
	srdi	12,5,7	/* Number of 128-byte blocks to move.  */
	cmpldi	cr1,11,0	/* If the tail is 0 bytes  */
	bne-	cr6,.L6	/* If source is not DW aligned.  */

    /* Move doublewords where destination and source are DW aligned.
       Use a unrolled loop to copy 16 doublewords (128-bytes) per iteration.
       If the copy is not an exact multiple of 128 bytes, 1-15
       doublewords are copied as needed to set up the main loop.  After
       the main loop exits there may be a tail of 1-7 bytes.  These byte
       are copied a word/halfword/byte at a time as needed to preserve
       alignment.

       For POWER6 the L1 is store-through and the L2 is store-in.  The
       L2 is clocked at half CPU clock so we can store 16 bytes every
       other cycle.  POWER6 also has a load/store bypass so we can do
       load, load, store, store every 2 cycles.

       The following code is sensitive to cache line alignment.  Do not
       make any change without first making sure they don't result in
       splitting ld/std pairs across a cache line.  */

	mtcrf	0x02,5		/* CR6: bits 25/26/27/28 = 64/32/16/8-byte flags.  */
	mtcrf	0x01,5		/* CR7: bits 29/30/31 = 4/2/1-byte tail flags.  */
	cmpldi	cr5,12,1	/* cr5: more than one 128-byte block?  */
	beq	L(das_loop)	/* len is an exact multiple of 128, no set-up.  */

	/* Pre-loop: peel 8..120 bytes so the main loop sees whole 128-byte
	   blocks.  r10/r11 shadow dst/src so later ld/std address generation
	   does not stall on the addi pointer updates.  */
	bf	25,4f		/* Skip if no 64-byte chunk.  */
	.align	3
	ld	6,0(4)
	ld	7,8(4)
	mr	11,4
	mr	10,3
	std	6,0(3)
	std	7,8(3)
	ld	6,16(4)
	ld	7,24(4)
	std	6,16(3)
	std	7,24(3)
	ld	6,0+32(4)
	ld	7,8+32(4)
	addi	4,4,64
	addi	3,3,64
	std	6,0+32(10)
	std	7,8+32(10)
	ld	6,16+32(11)
	ld	7,24+32(11)
	std	6,16+32(10)
	std	7,24+32(10)
4:
	mr	10,3
	bf	26,2f		/* Skip if no 32-byte chunk.  */
	ld	6,0(4)
	ld	7,8(4)
	mr	11,4
	nop
	std	6,0(3)
	std	7,8(3)
	ld	6,16(4)
	ld	7,24(4)
	addi	4,4,32
	std	6,16(3)
	std	7,24(3)
	addi	3,3,32
6:
	nop
	bf	27,5f		/* Skip if no 16-byte chunk.  */
	ld	6,0+32(11)
	ld	7,8+32(11)
	addi	4,4,16
	addi	3,3,16
	std	6,0+32(10)
	std	7,8+32(10)
	bf	28,L(das_loop_s)	/* Skip if no 8-byte chunk.  */
	ld	0,16+32(11)
	addi	4,4,8
	addi	3,3,8
	std	0,16+32(10)
	blt	cr5,L(das_tail)		/* No full 128-byte block left.  */
	b	L(das_loop)
	.align	3
5:
	nop
	bf	28,L(das_loop_s)
	ld	6,32(11)
	addi	4,4,8
	addi	3,3,8
	std	6,32(10)
	blt	cr5,L(das_tail)
	b	L(das_loop)
	.align	3
2:
	mr	11,4
	bf	27,1f
	ld	6,0(4)
	ld	7,8(4)
	addi	4,4,16
	addi	3,3,16
	std	6,0(10)
	std	7,8(10)
	bf	28,L(das_loop_s)
	ld	0,16(11)
	addi	4,11,24
	addi	3,10,24
	std	0,16(10)
	blt	cr5,L(das_tail)
	b	L(das_loop)
	.align	3
1:
	nop
	bf	28,L(das_loop_s)
	ld	6,0(4)
	addi	4,4,8
	addi	3,3,8
	std	6,0(10)
L(das_loop_s):
	nop
	blt	cr5,L(das_tail)		/* Fewer than 2 blocks: tail only.  */
	.align	4
/* Main aligned copy: 128 bytes (16 DWs) per iteration.  The first block
   is peeled here so the ctr-driven copy (das_loop2) can be entered with
   ctr = blocks - 1.  r10/r11 shadow dst/src across the mid-loop pointer
   updates to keep address generation off the critical path.  */
L(das_loop):
	ld	6,0(4)
	ld	7,8(4)
	mr	10,3
	mr	11,4
	std	6,0(3)
	std	7,8(3)
	addi	12,12,-1	/* One block consumed here.  */
	nop
	ld	8,16(4)
	ld	0,24(4)
	std	8,16(3)
	std	0,24(3)

	ld	6,0+32(4)
	ld	7,8+32(4)
	std	6,0+32(3)
	std	7,8+32(3)
	ld	8,16+32(4)
	ld	0,24+32(4)
	std	8,16+32(3)
	std	0,24+32(3)

	ld	6,0+64(11)
	ld	7,8+64(11)
	std	6,0+64(10)
	std	7,8+64(10)
	ld	8,16+64(11)
	ld	0,24+64(11)
	std	8,16+64(10)
	std	0,24+64(10)

	ld	6,0+96(11)
	ld	7,8+96(11)
	addi	4,4,128
	addi	3,3,128
	std	6,0+96(10)
	std	7,8+96(10)
	ld	8,16+96(11)
	ld	0,24+96(11)
	std	8,16+96(10)
	std	0,24+96(10)
	ble	cr5,L(das_loop_e)	/* Only one block: done with DWs.  */

	mtctr	12		/* Remaining blocks drive the counted loop.  */
	.align	4
L(das_loop2):
	ld	6,0(4)
	ld	7,8(4)
	mr	10,3
	mr	11,4
	std	6,0(3)
	std	7,8(3)
	ld	8,16(4)
	ld	0,24(4)
	std	8,16(3)
	std	0,24(3)

	ld	6,0+32(4)
	ld	7,8+32(4)
	std	6,0+32(3)
	std	7,8+32(3)
	ld	8,16+32(4)
	ld	0,24+32(4)
	std	8,16+32(3)
	std	0,24+32(3)

	ld	6,0+64(11)
	ld	7,8+64(11)
	std	6,0+64(10)
	std	7,8+64(10)
	ld	8,16+64(11)
	ld	0,24+64(11)
	std	8,16+64(10)
	std	0,24+64(10)

	ld	6,0+96(11)
	ld	7,8+96(11)
	addi	4,4,128
	addi	3,3,128
	std	6,0+96(10)
	std	7,8+96(10)
	ld	8,16+96(11)
	ld	0,24+96(11)
	std	8,16+96(10)
	std	0,24+96(10)
	bdnz	L(das_loop2)
L(das_loop_e):
/* Check of a 1-7 byte tail, return if none.  */
	bne	cr1,L(das_tail2)
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
	.align	4
L(das_tail):
	beq	cr1,0f		/* No tail: fall to the shared return.  */

L(das_tail2):
/* At this point we have a tail of 0-7 bytes and we know that the
   destination is double word aligned.  CR7 bits 29/30/31 (from len)
   select the 4/2/1-byte moves.  */
4:	bf	29,2f
	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	6,4(4)
	sth	6,4(3)
	bf	31,0f
	lbz	6,6(4)
	stb	6,6(3)
	b	0f
5:	bf	31,0f
	lbz	6,4(4)
	stb	6,4(3)
	b	0f

2:	bf	30,1f
	lhz	6,0(4)
	sth	6,0(3)
	bf	31,0f
	lbz	6,2(4)
	stb	6,2(3)
	b	0f

1:	bf	31,0f
	lbz	6,0(4)
	stb	6,0(3)
0:
	/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
364
/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31
   bytes.  Each case is handled without loops, using binary (1,2,4,8)
   tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32- 128-byte,
   and 4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 128-byte
   boundaries.  Since only loads are sensitive to the 32-/128-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use double word load/stores to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small (~20 cycle) delay.  */
	.align	4
.L2:
	mtcrf	0x01,5		/* CR7 bits from len for the 4/2/1-byte tests.  */
	neg	8,4		/* Low 2 bits of r8 = bytes to word-align src.  */
	clrrdi	11,4,2		/* r11 = src rounded down to a word boundary.  */
	andi.	0,8,3
	ble	cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
	cmpldi	cr1,5,16
	mr	10,5		/* r10 = remaining length.  */
	mr	12,4		/* r12 = running source pointer.  */
	cmpldi	cr6,0,2		/* cr6: 1, 2 or 3 fix-up bytes?  */
	beq	L(dus_tail)	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  The aligned word
   load from r11 covers all of the leading unaligned bytes.  */
	lwz	6,0(11)
	subf	10,0,5		/* Discount the fix-up bytes from the length.  */
	add	12,4,0		/* Advance src past them (now word aligned).  */
	blt	cr6,5f		/* 1 byte to move.  */
	srdi	7,6,16
	bgt	cr6,3f		/* 3 bytes to move.  */
#ifdef __LITTLE_ENDIAN__
	sth	7,0(3)
#else
	sth	6,0(3)
#endif
	b	7f
	.align	4
3:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,24
	stb	6,0(3)
	sth	7,1(3)
#else
	stb	7,0(3)
	sth	6,1(3)
#endif
	b	7f
	.align	4
5:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,8
#endif
	stb	6,0(3)
7:
	cmpldi	cr1,10,16	/* cr1: 16 or more bytes still to move?  */
	add	3,3,0		/* Advance dst past the fix-up bytes.  */
	mtcrf	0x01,10		/* Refresh CR7 from the reduced length.  */
	.align	4
L(dus_tail):
/* At least 6 bytes left and the source is word aligned.  This allows
   some speculative loads up front.  */
/* We need to special case the fall-through because the biggest delays
   are due to address computation not being ready in time for the
   AGEN.  */
	lwz	6,0(12)		/* Speculative: always at least 4 bytes left.  */
	lwz	7,4(12)
	blt	cr1,L(dus_tail8)	/* Fewer than 16 bytes remain.  */
	cmpldi	cr0,10,24
L(dus_tail16): /* Move 16 bytes.  */
	stw	6,0(3)
	stw	7,4(3)
	lwz	6,8(12)
	lwz	7,12(12)
	stw	6,8(3)
	stw	7,12(3)
/* Move 8 bytes more.  */
	bf	28,L(dus_tail16p8)
	cmpldi	cr1,10,28
	lwz	6,16(12)
	lwz	7,20(12)
	stw	6,16(3)
	stw	7,20(3)
/* Move 4 bytes more.  */
	bf	29,L(dus_tail16p4)
	lwz	6,24(12)
	stw	6,24(3)
	addi	12,12,28
	addi	3,3,28
	bgt	cr1,L(dus_tail2)
	/* exactly 28 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p8):  /* less than 8 bytes left.  */
	beq	cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
	cmpldi	cr1,10,20
	bf	29,L(dus_tail16p2)
/* Move 4 bytes more.  */
	lwz	6,16(12)
	stw	6,16(3)
	addi	12,12,20
	addi	3,3,20
	bgt	cr1,L(dus_tail2)
	/* exactly 20 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p4):  /* less than 4 bytes left.  */
	addi	12,12,24
	addi	3,3,24
	bgt	cr0,L(dus_tail2)
	/* exactly 24 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
	addi	12,12,16
	addi	3,3,16
	b	L(dus_tail2)

	.align	4
L(dus_tail8):  /* Move 8 bytes.  */
/* r6, r7 already loaded speculatively.  */
	cmpldi	cr1,10,8
	cmpldi	cr0,10,12
	bf	28,L(dus_tail4)
	.align	2
	stw	6,0(3)
	stw	7,4(3)
/* Move 4 bytes more.  */
	bf	29,L(dus_tail8p4)
	lwz	6,8(12)
	stw	6,8(3)
	addi	12,12,12
	addi	3,3,12
	bgt	cr0,L(dus_tail2)
	/* exactly 12 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail8p4):  /* less than 4 bytes left.  */
	addi	12,12,8
	addi	3,3,8
	bgt	cr1,L(dus_tail2)
	/* exactly 8 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr

	.align	4
L(dus_tail4):  /* Move 4 bytes.  */
/* r6 already loaded speculatively.  If we are here we know there is
   more than 4 bytes left.  So there is no need to test.  */
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
L(dus_tail2):  /* Move 2-3 bytes.  */
	bf	30,L(dus_tail1)
	lhz	6,0(12)
	sth	6,0(3)
	bf	31,L(dus_tailX)
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	blr
L(dus_tail1):  /* Move 1 byte.  */
	bf	31,L(dus_tailX)
	lbz	6,0(12)
	stb	6,0(3)
L(dus_tailX):
	/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
548
/* Special case to copy 0-8 bytes.  */
	.align	4
.LE8:
	mr	12,4
	bne	cr6,L(dus_4)	/* len != 8: use the bit-test ladder below.  */
/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
   cycle delay.  This case should be rare and any attempt to avoid this
   would take most of 20 cycles any way.  */
	ld	6,0(4)
	std	6,0(3)
	/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_4):
	/* 0-7 bytes: CR7 bits 29/30/31 (from len) select 4/2/1 bytes.  */
	bf	29,L(dus_tail2)
	lwz	6,0(4)
	stw	6,0(3)
	bf	30,L(dus_5)
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,L(dus_0)
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	blr
	.align	4
L(dus_5):
	bf	31,L(dus_0)
	lbz	6,4(4)
	stb	6,4(3)
L(dus_0):
	/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
584
	.align	4
.L6:
	cfi_offset(31,-8)
	mr	12,4		/* r12 = original (unadjusted) src.  */
	mr	31,5		/* r31 = remaining length (survives the loops).  */
  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.
     The shift count is fixed per entry point (du1..du7 = src misalignment
     of 1..7 bytes), so the branch tree below dispatches on r10.  */
	addi	11,9,-1	/* loop DW count is one less than total */
	subf	5,10,12	/* Move source addr to previous full double word.  */
	cmpldi	cr5, 10, 2
	cmpldi	cr0, 10, 4
	mr	4,3		/* r4 is the running dst inside the du loops.  */
	srdi	8,11,2	/* calculate the 32 byte loop count */
	ld	6,0(5)	/* pre load 1st full doubleword.  */
	mtcrf	0x01,11		/* CR7 bits 30/31: 2-DW / odd-DW set-up cases.  */
	cmpldi	cr6,9,4		/* cr6: total DWs vs 4 (loop-bypass tests).  */
	mtctr	8
	ld	7,8(5)	/* pre load 2nd full doubleword.  */
	bge	cr0, L(du4_do)
	blt	cr5, L(du1_do)
	beq	cr5, L(du2_do)
	b	L(du3_do)
608
	.align	4
/* Source is 1 byte past a DW boundary: each stored DW merges the two
   preloaded aligned DWs r6/r7 with an 8-bit shift.  r5 = aligned src,
   r4 = dst, ctr = 32-byte iterations.  */
L(du1_do):
	bf	30,L(du1_1dw)

	/* there are at least two DWs to copy */
	/* FIXME: can combine last shift and "or" into "rldimi" */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 8
	sldi	8,6, 64-8
#else
	sldi	0,7, 8
	srdi	8,6, 64-8
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du1_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du1_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du1_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du1_loop)
	.align	4
L(du1_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du1_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du1_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 8
	sldi	8,6, 64-8
#else
	sldi	0,7, 8
	srdi	8,6, 64-8
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 8
	sldi	8,6, 64-8
#else
	sldi	0,7, 8
	srdi	8,6, 64-8
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du1_loop)
	.align	4
L(du1_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
731
	.align	4
/* Source is 2 bytes past a DW boundary; identical structure to
   L(du1_*) but with a 16-bit shift.  */
L(du2_do):
	bf	30,L(du2_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 16
	sldi	8,6, 64-16
#else
	sldi	0,7, 16
	srdi	8,6, 64-16
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du2_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du2_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du2_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du2_loop)
	.align	4
L(du2_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du2_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du2_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 16
	sldi	8,6, 64-16
#else
	sldi	0,7, 16
	srdi	8,6, 64-16
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 16
	sldi	8,6, 64-16
#else
	sldi	0,7, 16
	srdi	8,6, 64-16
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du2_loop)
	.align	4
L(du2_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
853
	.align	4
/* Source is 3 bytes past a DW boundary; identical structure to
   L(du1_*) but with a 24-bit shift.  */
L(du3_do):
	bf	30,L(du3_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 24
	sldi	8,6, 64-24
#else
	sldi	0,7, 24
	srdi	8,6, 64-24
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du3_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du3_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du3_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du3_loop)
	.align	4
L(du3_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du3_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du3_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 24
	sldi	8,6, 64-24
#else
	sldi	0,7, 24
	srdi	8,6, 64-24
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 24
	sldi	8,6, 64-24
#else
	sldi	0,7, 24
	srdi	8,6, 64-24
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du3_loop)
	.align	4
L(du3_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
975
	.align	4
/* Misalignment >= 4: secondary dispatch on r10 (4..7), then the
   4-byte-shift copier itself at L(du4_dox).  Same structure as
   L(du1_*) with a 32-bit shift.  */
L(du4_do):
	cmpldi	cr5, 10, 6
	beq	cr0, L(du4_dox)		/* cr0 was r10 ? 4.  */
	blt	cr5, L(du5_do)
	beq	cr5, L(du6_do)
	b	L(du7_do)
L(du4_dox):
	bf	30,L(du4_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 32
	sldi	8,6, 64-32
#else
	sldi	0,7, 32
	srdi	8,6, 64-32
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du4_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du4_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du4_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du4_loop)
	.align	4
L(du4_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du4_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du4_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 32
	sldi	8,6, 64-32
#else
	sldi	0,7, 32
	srdi	8,6, 64-32
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 32
	sldi	8,6, 64-32
#else
	sldi	0,7, 32
	srdi	8,6, 64-32
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du4_loop)
	.align	4
L(du4_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
1103
	.align	4
/* Source is 5 bytes past a DW boundary; identical structure to
   L(du1_*) but with a 40-bit shift.  */
L(du5_do):
	bf	30,L(du5_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 40
	sldi	8,6, 64-40
#else
	sldi	0,7, 40
	srdi	8,6, 64-40
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du5_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du5_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du5_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du5_loop)
	.align	4
L(du5_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du5_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du5_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 40
	sldi	8,6, 64-40
#else
	sldi	0,7, 40
	srdi	8,6, 64-40
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 40
	sldi	8,6, 64-40
#else
	sldi	0,7, 40
	srdi	8,6, 64-40
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du5_loop)
	.align	4
L(du5_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
1225
	.align	4
/* Source is 6 bytes past a DW boundary; identical structure to
   L(du1_*) but with a 48-bit shift.  */
L(du6_do):
	bf	30,L(du6_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 48
	sldi	8,6, 64-48
#else
	sldi	0,7, 48
	srdi	8,6, 64-48
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du6_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du6_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du6_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du6_loop)
	.align	4
L(du6_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du6_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du6_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 48
	sldi	8,6, 64-48
#else
	sldi	0,7, 48
	srdi	8,6, 64-48
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 48
	sldi	8,6, 64-48
#else
	sldi	0,7, 48
	srdi	8,6, 64-48
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du6_loop)
	.align	4
L(du6_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 48
	sldi	8,7, 64-48
#else
	sldi	0,6, 48
	srdi	8,7, 64-48
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
1347
	.align	4
/* Source is 7 bytes past a DW boundary; identical structure to
   L(du1_*) but with a 56-bit shift.  */
L(du7_do):
	bf	30,L(du7_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 56
	sldi	8,6, 64-56
#else
	sldi	0,7, 56
	srdi	8,6, 64-56
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du7_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du7_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du7_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du7_loop)
	.align	4
L(du7_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du7_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du7_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 56
	sldi	8,6, 64-56
#else
	sldi	0,7, 56
	srdi	8,6, 64-56
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 56
	sldi	8,6, 64-56
#else
	sldi	0,7, 56
	srdi	8,6, 64-56
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du7_loop)
	.align	4
L(du7_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 56
	sldi	8,7, 64-56
#else
	sldi	0,6, 56
	srdi	8,7, 64-56
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
1469
	.align	4
L(du_done):
	rldicr	0,31,0,60	/* r0 = len & ~7: bytes moved by the DW loops.  */
	mtcrf	0x01,31		/* CR7 bits 29/30/31 = 4/2/1-byte tail flags.  */
	beq	cr1,0f	/* If the tail is 0 bytes we are done!  */

	add	3,3,0		/* Advance dst past the copied doublewords.  */
	add	12,12,0		/* Advance the original (unadjusted) src too.  */
/* At this point we have a tail of 0-7 bytes and we know that the
   destination is double word aligned.  */
4:	bf	29,2f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	bf	30,1f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,0f
	lbz	6,0(12)
	stb	6,0(3)
0:
	/* Return original dst pointer.  */
	ld	31,-8(1)	/* Restore the saved r31.  */
	ld	3,-16(1)
	blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)
1500

/* source code of glibc/sysdeps/powerpc/powerpc64/power6/memcpy.S */