1 | /* Optimized memcpy implementation for PowerPC64. |
2 | Copyright (C) 2003-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | /* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]); |
22 | Returns 'dst'. |
23 | |
24 | Memcpy handles short copies (< 32-bytes) using a binary move blocks |
25 | (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled |
26 | with the appropriate combination of byte and halfword load/stores. |
27 | There is minimal effort to optimize the alignment of short moves. |
28 | The 64-bit implementations of POWER3 and POWER4 do a reasonable job |
29 | of handling unaligned load/stores that do not cross 32-byte boundaries. |
30 | |
31 | Longer moves (>= 32-bytes) justify the effort to get at least the |
32 | destination doubleword (8-byte) aligned. Further optimization is |
33 | possible when both source and destination are doubleword aligned. |
34 | Each case has a optimized unrolled loop. |
35 | |
36 | For POWER6 unaligned loads will take a 20+ cycle hiccup for any |
37 | L1 cache miss that crosses a 32- or 128-byte boundary. Store |
38 | is more forgiving and does not take a hiccup until page or |
39 | segment boundaries. So we require doubleword alignment for |
40 | the source but may take a risk and only require word alignment |
41 | for the destination. */ |
42 | |
43 | #ifndef MEMCPY |
44 | # define MEMCPY memcpy |
45 | #endif |
	.machine "power6"
/* Entry: r3 = dst, r4 = src, r5 = len.  Returns dst in r3.
   The original dst is saved below the stack pointer at -16(r1) so every
   exit path can reload it; r31 is saved at -8(r1) for the unaligned
   path (.L6), which uses it to hold the length.  */
ENTRY_TOCLESS (MEMCPY, 7)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31	/* cr1: short (< 32 byte) move?  */
	neg	0,3		/* r0 = -dst; low 3 bits = bytes to next DW.  */
	std	3,-16(1)	/* Save dst for the final return value.  */
	std	31,-8(1)	/* Save r31 for the unaligned (.L6) path.  */
	andi.	11,3,7		/* check alignment of dst.  */
	clrldi	0,0,61		/* Number of bytes until the 1st doubleword of dst.  */
	clrldi	10,4,61		/* check alignment of src.  */
	cmpldi	cr6,5,8
	ble-	cr1,.L2		/* If move < 32 bytes use short move code.  */
	mtcrf	0x01,0		/* CR bit 31/30/29 = do a 1/2/4 byte step.  */
	cmpld	cr6,10,11	/* cr6: do src and dst share the same alignment?  */
	srdi	9,5,3		/* Number of full double words remaining.  */
	beq	.L0		/* dst already doubleword aligned.  */

	subf	5,0,5		/* Reduce len by the alignment prefix.  */
/* Move 0-7 bytes as needed to get the destination doubleword aligned.
   Duplicate some code to maximize fall-through and minimize agen delays.
   The CR bits set from r0 above select the byte (bit 31), halfword
   (bit 30) and word (bit 29) steps; offsets are folded statically so
   each path is straight-line code.  */
1:	bf	31,2f
	lbz	6,0(4)
	stb	6,0(3)
	bf	30,5f
	lhz	6,1(4)
	sth	6,1(3)
	bf	29,0f
	lwz	6,3(4)
	stw	6,3(3)
	b	0f
5:
	bf	29,0f
	lwz	6,1(4)
	stw	6,1(3)
	b	0f

2:	bf	30,4f
	lhz	6,0(4)
	sth	6,0(3)
	bf	29,0f
	lwz	6,2(4)
	stw	6,2(3)
	b	0f

4:	bf	29,0f
	lwz	6,0(4)
	stw	6,0(3)
0:
/* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
	add	4,4,0
	add	3,3,0

	clrldi	10,4,61	/* check alignment of src again.  */
	srdi	9,5,3	/* Number of full double words remaining.  */

/* Copy doublewords from source to destination, assuming the
   destination is aligned on a doubleword boundary.

   At this point we know there are at least 25 bytes left (32-7) to copy.
   The next step is to determine if the source is also doubleword aligned.
   If not branch to the unaligned move code at .L6, which uses
   a load, shift, store strategy.

   Otherwise source and destination are doubleword aligned, and we can
   use the optimized doubleword copy loop.  */
	.align	4
.L0:
	clrldi	11,5,61		/* r11 = tail length (len mod 8).  */
	andi.	0,5,0x78	/* len bits selecting 8..64 byte sub-blocks.  */
	srdi	12,5,7		/* Number of 128-byte blocks to move.  */
	cmpldi	cr1,11,0	/* If the tail is 0 bytes  */
	bne-	cr6,.L6		/* If source is not DW aligned.  */

/* Move doublewords where destination and source are DW aligned.
   Use a unrolled loop to copy 16 doublewords (128-bytes) per iteration.
   If the copy is not an exact multiple of 128 bytes, 1-15
   doublewords are copied as needed to set up the main loop.  After
   the main loop exits there may be a tail of 1-7 bytes.  These bytes
   are copied a word/halfword/byte at a time as needed to preserve
   alignment.

   For POWER6 the L1 is store-through and the L2 is store-in.  The
   L2 is clocked at half CPU clock so we can store 16 bytes every
   other cycle.  POWER6 also has a load/store bypass so we can do
   load, load, store, store every 2 cycles.

   The following code is sensitive to cache line alignment.  Do not
   make any change without first making sure they don't result in
   splitting ld/std pairs across a cache line.  */

	mtcrf	0x02,5		/* Expose len sub-block bits to CR (bits 24-27).  */
	mtcrf	0x01,5		/* Expose len tail bits to CR (bits 28-31).  */
	cmpldi	cr5,12,1	/* cr5: one 128-byte block or fewer left?  */
	beq	L(das_loop)

/* Pre-loop: peel off the leftover 8-120 bytes so the remaining length
   is a multiple of 128.  CR bit 25 selects a 64-byte sub-copy, bit 26
   a 32-byte, bit 27 a 16-byte and bit 28 an 8-byte one.  r10/r11 shadow
   dst/src so r3/r4 can be bumped early without stalling the AGEN.  */
	bf	25,4f
	.align	3
	ld	6,0(4)
	ld	7,8(4)
	mr	11,4
	mr	10,3
	std	6,0(3)
	std	7,8(3)
	ld	6,16(4)
	ld	7,24(4)
	std	6,16(3)
	std	7,24(3)
	ld	6,0+32(4)
	ld	7,8+32(4)
	addi	4,4,64
	addi	3,3,64
	std	6,0+32(10)
	std	7,8+32(10)
	ld	6,16+32(11)
	ld	7,24+32(11)
	std	6,16+32(10)
	std	7,24+32(10)
4:
	mr	10,3
	bf	26,2f
	ld	6,0(4)
	ld	7,8(4)
	mr	11,4
	nop
	std	6,0(3)
	std	7,8(3)
	ld	6,16(4)
	ld	7,24(4)
	addi	4,4,32
	std	6,16(3)
	std	7,24(3)
	addi	3,3,32
6:
	nop
	bf	27,5f
	ld	6,0+32(11)
	ld	7,8+32(11)
	addi	4,4,16
	addi	3,3,16
	std	6,0+32(10)
	std	7,8+32(10)
	bf	28,L(das_loop_s)
	ld	0,16+32(11)
	addi	4,4,8
	addi	3,3,8
	std	0,16+32(10)
	blt	cr5,L(das_tail)	/* no full 128-byte block left.  */
	b	L(das_loop)
	.align	3
5:
	nop
	bf	28,L(das_loop_s)
	ld	6,32(11)
	addi	4,4,8
	addi	3,3,8
	std	6,32(10)
	blt	cr5,L(das_tail)
	b	L(das_loop)
	.align	3
2:
	mr	11,4
	bf	27,1f
	ld	6,0(4)
	ld	7,8(4)
	addi	4,4,16
	addi	3,3,16
	std	6,0(10)
	std	7,8(10)
	bf	28,L(das_loop_s)
	ld	0,16(11)
	addi	4,11,24
	addi	3,10,24
	std	0,16(10)
	blt	cr5,L(das_tail)
	b	L(das_loop)
	.align	3
1:
	nop
	bf	28,L(das_loop_s)
	ld	6,0(4)
	addi	4,4,8
	addi	3,3,8
	std	6,0(10)
L(das_loop_s):
	nop
	blt	cr5,L(das_tail)	/* fewer than 128 bytes left: go do the tail.  */
	.align	4
/* Main aligned copy: 128 bytes (16 doublewords) per iteration.  The
   first iteration is peeled into L(das_loop) so the block count (r12)
   can be decremented and tested (cr5) before committing to the CTR
   loop L(das_loop2).  r10/r11 shadow the pre-increment dst/src so the
   second half of each block can still be addressed after r3/r4 have
   been bumped early, keeping loads ahead of the AGEN.  */
L(das_loop):
	ld	6,0(4)
	ld	7,8(4)
	mr	10,3		/* shadow dst before it is advanced.  */
	mr	11,4		/* shadow src before it is advanced.  */
	std	6,0(3)
	std	7,8(3)
	addi	12,12,-1	/* one 128-byte block consumed.  */
	nop
	ld	8,16(4)
	ld	0,24(4)
	std	8,16(3)
	std	0,24(3)

	ld	6,0+32(4)
	ld	7,8+32(4)
	std	6,0+32(3)
	std	7,8+32(3)
	ld	8,16+32(4)
	ld	0,24+32(4)
	std	8,16+32(3)
	std	0,24+32(3)

	ld	6,0+64(11)
	ld	7,8+64(11)
	std	6,0+64(10)
	std	7,8+64(10)
	ld	8,16+64(11)
	ld	0,24+64(11)
	std	8,16+64(10)
	std	0,24+64(10)

	ld	6,0+96(11)
	ld	7,8+96(11)
	addi	4,4,128		/* advance src early; tail uses r11.  */
	addi	3,3,128		/* advance dst early; tail uses r10.  */
	std	6,0+96(10)
	std	7,8+96(10)
	ld	8,16+96(11)
	ld	0,24+96(11)
	std	8,16+96(10)
	std	0,24+96(10)
	ble	cr5,L(das_loop_e)	/* that was the last block.  */

	mtctr	12		/* remaining blocks run under CTR.  */
	.align	4
L(das_loop2):
	ld	6,0(4)
	ld	7,8(4)
	mr	10,3
	mr	11,4
	std	6,0(3)
	std	7,8(3)
	ld	8,16(4)
	ld	0,24(4)
	std	8,16(3)
	std	0,24(3)

	ld	6,0+32(4)
	ld	7,8+32(4)
	std	6,0+32(3)
	std	7,8+32(3)
	ld	8,16+32(4)
	ld	0,24+32(4)
	std	8,16+32(3)
	std	0,24+32(3)

	ld	6,0+64(11)
	ld	7,8+64(11)
	std	6,0+64(10)
	std	7,8+64(10)
	ld	8,16+64(11)
	ld	0,24+64(11)
	std	8,16+64(10)
	std	0,24+64(10)

	ld	6,0+96(11)
	ld	7,8+96(11)
	addi	4,4,128
	addi	3,3,128
	std	6,0+96(10)
	std	7,8+96(10)
	ld	8,16+96(11)
	ld	0,24+96(11)
	std	8,16+96(10)
	std	0,24+96(10)
	bdnz	L(das_loop2)
L(das_loop_e):
/* Check for a 1-7 byte tail, return if none (cr1 set at .L0).  */
	bne	cr1,L(das_tail2)
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
	.align	4
L(das_tail):
	beq	cr1,0f		/* cr1 from .L0: tail length is zero.  */

L(das_tail2):
/* At this point we have a tail of 0-7 bytes and we know that the
   destination is double word aligned.  CR bits 29/30/31 (set from len
   at .L0) select the word, halfword and byte steps, as in the
   alignment prologue.  */
4:	bf	29,2f
	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	6,4(4)
	sth	6,4(3)
	bf	31,0f
	lbz	6,6(4)
	stb	6,6(3)
	b	0f
5:	bf	31,0f
	lbz	6,4(4)
	stb	6,4(3)
	b	0f

2:	bf	30,1f
	lhz	6,0(4)
	sth	6,0(3)
	bf	31,0f
	lbz	6,2(4)
	stb	6,2(3)
	b	0f

1:	bf	31,0f
	lbz	6,0(4)
	stb	6,0(3)
0:
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
364 | |
/* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
   bytes.  Each case is handled without loops, using binary (1,2,4,8)
   tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32- 128-byte,
   and 4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 128-byte
   boundaries.  Since only loads are sensitive to the 32-/128-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use double word load/stores to insure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small (~20 cycle) delay.  */
	.align	4
.L2:
	mtcrf	0x01,5		/* Expose len tail bits to CR.  */
	neg	8,4		/* r8 low bits = bytes to word-align src.  */
	clrrdi	11,4,2		/* r11 = src rounded down to a word.  */
	andi.	0,8,3		/* r0 = 0-3 bytes needed to align src.  */
	ble	cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
	cmpldi	cr1,5,16
	mr	10,5		/* r10 = remaining length.  */
	mr	12,4		/* r12 = working src pointer.  */
	cmpldi	cr6,0,2		/* cr6: 1, 2 or 3 alignment bytes?  */
	beq	L(dus_tail)	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  The whole
   aligned word at r11 is loaded once and the needed 1-3 leading bytes
   are extracted with shifts/rotates (endian dependent).  */
	lwz	6,0(11)
	subf	10,0,5		/* len -= alignment bytes.  */
	add	12,4,0		/* src += alignment bytes (now word aligned).  */
	blt	cr6,5f		/* 1 byte.  */
	srdi	7,6,16
	bgt	cr6,3f		/* 3 bytes.  */
/* 2 bytes.  */
#ifdef __LITTLE_ENDIAN__
	sth	7,0(3)
#else
	sth	6,0(3)
#endif
	b	7f
	.align	4
3:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,24
	stb	6,0(3)
	sth	7,1(3)
#else
	stb	7,0(3)
	sth	6,1(3)
#endif
	b	7f
	.align	4
5:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,8
#endif
	stb	6,0(3)
7:
	cmpldi	cr1,10,16	/* cr1: at least 16 bytes left?  */
	add	3,3,0		/* dst += alignment bytes.  */
	mtcrf	0x01,10		/* Expose updated len bits to CR.  */
	.align	4
L(dus_tail):
/* At least 6 bytes left and the source is word aligned.  This allows
   some speculative loads up front.  */
/* We need to special case the fall-through because the biggest delays
   are due to address computation not being ready in time for the
   AGEN.  */
	lwz	6,0(12)		/* speculative: first two words.  */
	lwz	7,4(12)
	blt	cr1,L(dus_tail8)	/* fewer than 16 bytes left.  */
	cmpldi	cr0,10,24
L(dus_tail16): /* Move 16 bytes.  */
	stw	6,0(3)
	stw	7,4(3)
	lwz	6,8(12)
	lwz	7,12(12)
	stw	6,8(3)
	stw	7,12(3)
/* Move 8 bytes more.  */
	bf	28,L(dus_tail16p8)
	cmpldi	cr1,10,28
	lwz	6,16(12)
	lwz	7,20(12)
	stw	6,16(3)
	stw	7,20(3)
/* Move 4 bytes more.  */
	bf	29,L(dus_tail16p4)
	lwz	6,24(12)
	stw	6,24(3)
	addi	12,12,28
	addi	3,3,28
	bgt	cr1,L(dus_tail2)	/* 29-31 bytes: 1-3 byte tail left.  */
/* exactly 28 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p8):  /* less than 8 bytes left.  */
	beq	cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
	cmpldi	cr1,10,20
	bf	29,L(dus_tail16p2)
/* Move 4 bytes more.  */
	lwz	6,16(12)
	stw	6,16(3)
	addi	12,12,20
	addi	3,3,20
	bgt	cr1,L(dus_tail2)
/* exactly 20 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p4):  /* less than 4 bytes left.  */
	addi	12,12,24
	addi	3,3,24
	bgt	cr0,L(dus_tail2)
/* exactly 24 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
	addi	12,12,16
	addi	3,3,16
	b	L(dus_tail2)
495 | |
	.align	4
L(dus_tail8):  /* Move 8 bytes.  */
/* r6, r7 already loaded speculatively at L(dus_tail).  */
	cmpldi	cr1,10,8
	cmpldi	cr0,10,12
	bf	28,L(dus_tail4)	/* fewer than 8 bytes: skip the pair.  */
	.align	2
	stw	6,0(3)
	stw	7,4(3)
/* Move 4 bytes more.  */
	bf	29,L(dus_tail8p4)
	lwz	6,8(12)
	stw	6,8(3)
	addi	12,12,12
	addi	3,3,12
	bgt	cr0,L(dus_tail2)	/* 13-15 bytes: 1-3 byte tail left.  */
/* exactly 12 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail8p4):  /* less than 4 bytes left.  */
	addi	12,12,8
	addi	3,3,8
	bgt	cr1,L(dus_tail2)
/* exactly 8 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr

	.align	4
L(dus_tail4):  /* Move 4 bytes.  */
/* r6 already loaded speculatively.  If we are here we know there is
   more than 4 bytes left.  So there is no need to test.  */
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
L(dus_tail2):  /* Move 2-3 bytes.  */
	bf	30,L(dus_tail1)
	lhz	6,0(12)
	sth	6,0(3)
	bf	31,L(dus_tailX)
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	blr
L(dus_tail1):  /* Move 1 byte.  */
	bf	31,L(dus_tailX)
	lbz	6,0(12)
	stb	6,0(3)
L(dus_tailX):
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
548 | |
/* Special case to copy 0-8 bytes.  cr6 still holds (len cmp 8) from
   the function entry.  */
	.align	4
.LE8:
	mr	12,4
	bne	cr6,L(dus_4)	/* len != 8: go do the 0-7 byte cases.  */
/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
   cycle delay.  This case should be rare and any attempt to avoid this
   would take most of 20 cycles any way.  */
	ld	6,0(4)
	std	6,0(3)
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_4):
/* 0-7 bytes: CR bits 29/30/31 (set from len at .L2) select the word,
   halfword and byte steps.  */
	bf	29,L(dus_tail2)
	lwz	6,0(4)
	stw	6,0(3)
	bf	30,L(dus_5)
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,L(dus_0)
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	blr
	.align	4
L(dus_5):
	bf	31,L(dus_0)
	lbz	6,4(4)
	stb	6,4(3)
L(dus_0):
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
584 | |
	.align	4
.L6:
	cfi_offset(31,-8)
	mr	12,4		/* r12 = original (unaligned) src.  */
	mr	31,5		/* r31 = length, preserved across the copy.  */
/* Copy doublewords where the destination is aligned but the source is
   not.  Use aligned doubleword loads from the source, shifted to realign
   the data, to allow aligned destination stores.  The source
   misalignment (r10 = 1..7) selects one of seven specialized copy
   routines, L(du1_do)..L(du7_do), each hard-coding its shift amount
   (8 * r10 bits).  */
	addi	11,9,-1	/* loop DW count is one less than total */
	subf	5,10,12	/* Move source addr to previous full double word.  */
	cmpldi	cr5, 10, 2
	cmpldi	cr0, 10, 4
	mr	4,3		/* r4 = dst (aligned store pointer).  */
	srdi	8,11,2	/* calculate the 32 byte loop count */
	ld	6,0(5)	/* pre load 1st full doubleword.  */
	mtcrf	0x01,11	/* Expose DW-count low bits to CR.  */
	cmpldi	cr6,9,4	/* cr6: total DWs vs 4, used to bypass the loop.  */
	mtctr	8
	ld	7,8(5)	/* pre load 2nd full doubleword.  */
	bge	cr0, L(du4_do)	/* misalignment 4-7.  */
	blt	cr5, L(du1_do)	/* misalignment 1.  */
	beq	cr5, L(du2_do)	/* misalignment 2.  */
	b	L(du3_do)	/* misalignment 3.  */
608 | |
	.align	4
/* Unaligned copy for a source 1 byte past a doubleword boundary.
   Each destination DW is built from the pair of aligned DWs r6:r7
   with a shift of 8 bits and an or; r6/r7 alternate roles so only one
   new load is needed per stored DW.  */
L(du1_do):
	bf	30,L(du1_1dw)

	/* there are at least two DWs to copy */
	/* FIXME: can combine last shift and "or" into "rldimi" */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 8
	sldi	8,6, 64-8
#else
	sldi	0,7, 8
	srdi	8,6, 64-8
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du1_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du1_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du1_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du1_loop)
	.align	4
L(du1_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du1_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du1_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 8
	sldi	8,6, 64-8
#else
	sldi	0,7, 8
	srdi	8,6, 64-8
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 8
	sldi	8,6, 64-8
#else
	sldi	0,7, 8
	srdi	8,6, 64-8
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du1_loop)
	.align	4
L(du1_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 8
	sldi	8,7, 64-8
#else
	sldi	0,6, 8
	srdi	8,7, 64-8
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
731 | |
	.align	4
/* As L(du1_do), but for a source 2 bytes past a doubleword boundary
   (shift amount 16 bits).  */
L(du2_do):
	bf	30,L(du2_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 16
	sldi	8,6, 64-16
#else
	sldi	0,7, 16
	srdi	8,6, 64-16
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du2_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du2_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du2_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du2_loop)
	.align	4
L(du2_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du2_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du2_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 16
	sldi	8,6, 64-16
#else
	sldi	0,7, 16
	srdi	8,6, 64-16
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 16
	sldi	8,6, 64-16
#else
	sldi	0,7, 16
	srdi	8,6, 64-16
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du2_loop)
	.align	4
L(du2_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 16
	sldi	8,7, 64-16
#else
	sldi	0,6, 16
	srdi	8,7, 64-16
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
853 | |
	.align	4
/* As L(du1_do), but for a source 3 bytes past a doubleword boundary
   (shift amount 24 bits).  */
L(du3_do):
	bf	30,L(du3_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 24
	sldi	8,6, 64-24
#else
	sldi	0,7, 24
	srdi	8,6, 64-24
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du3_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du3_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du3_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du3_loop)
	.align	4
L(du3_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du3_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du3_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 24
	sldi	8,6, 64-24
#else
	sldi	0,7, 24
	srdi	8,6, 64-24
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 24
	sldi	8,6, 64-24
#else
	sldi	0,7, 24
	srdi	8,6, 64-24
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du3_loop)
	.align	4
L(du3_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 24
	sldi	8,7, 64-24
#else
	sldi	0,6, 24
	srdi	8,7, 64-24
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
975 | |
	.align	4
/* Second-level dispatch for source misalignment 4-7 (cr0/cr5 were set
   at .L6), then the misalignment-4 copy (shift amount 32 bits) at
   L(du4_dox), same structure as L(du1_do).  */
L(du4_do):
	cmpldi	cr5, 10, 6
	beq	cr0, L(du4_dox)	/* misalignment 4.  */
	blt	cr5, L(du5_do)	/* misalignment 5.  */
	beq	cr5, L(du6_do)	/* misalignment 6.  */
	b	L(du7_do)	/* misalignment 7.  */
L(du4_dox):
	bf	30,L(du4_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 32
	sldi	8,6, 64-32
#else
	sldi	0,7, 32
	srdi	8,6, 64-32
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du4_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du4_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du4_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du4_loop)
	.align	4
L(du4_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du4_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du4_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 32
	sldi	8,6, 64-32
#else
	sldi	0,7, 32
	srdi	8,6, 64-32
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 32
	sldi	8,6, 64-32
#else
	sldi	0,7, 32
	srdi	8,6, 64-32
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du4_loop)
	.align	4
L(du4_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 32
	sldi	8,7, 64-32
#else
	sldi	0,6, 32
	srdi	8,7, 64-32
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
1103 | |
	.align	4
/* As L(du1_do), but for a source 5 bytes past a doubleword boundary
   (shift amount 40 bits).  */
L(du5_do):
	bf	30,L(du5_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 40
	sldi	8,6, 64-40
#else
	sldi	0,7, 40
	srdi	8,6, 64-40
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du5_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du5_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du5_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du5_loop)
	.align	4
L(du5_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,L(du5_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du5_loop):
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 40
	sldi	8,6, 64-40
#else
	sldi	0,7, 40
	srdi	8,6, 64-40
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi	0,7, 40
	sldi	8,6, 64-40
#else
	sldi	0,7, 40
	srdi	8,6, 64-40
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du5_loop)
	.align	4
L(du5_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi	0,6, 40
	sldi	8,7, 64-40
#else
	sldi	0,6, 40
	srdi	8,7, 64-40
#endif
	or	0,0,8
	std	0,0(4)
	b	L(du_done)
1225 | |
.align 4
L(du6_do):
/* Entry for a source misaligned by 6 bytes (shift = 48 bits).
   NOTE(review): r6/r7 appear to be preloaded with the first two source
   doublewords, and cr6/cr7 set from the doubleword count, before this
   excerpt -- confirm against the dispatch code above.  bf 30 tests a
   count bit to pick between the 1-DW and 2-DW pre-loop paths.  */
bf 30,L(du6_1dw)

/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
/* Second pre-loop doubleword.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 48
sldi 8,6, 64-48
#else
sldi 0,7, 48
srdi 8,6, 64-48
#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
addi 4,4,16
addi 5,5,32
blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du6_loop)
/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
or 0,0,8
std 0,0(4)
/* Rotate the double-buffer: old high word moves to r6, refill r7.  */
mr 6,7
ld 7,0(5)
addi 5,5,8
addi 4,4,8
beq cr6,L(du6_fini) /* if total DWs = 4, then bypass loop */
b L(du6_loop)
.align 4
L(du6_1dw):
/* Pre-loop path when fewer doublewords remain before the unrolled
   loop: form one combined doubleword now; if bit 31 says an odd DW
   exists, store it and advance, otherwise fall into the loop with the
   combined value pending in r0.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
addi 5,5,16
or 0,0,8
bf 31,L(du6_loop)
mr 6,7
ld 7,0(5)
addi 5,5,8
std 0,0(4)
addi 4,4,8
.align 4
/* copy 32 bytes at a time */
/* Main unrolled loop for the 48-bit-shift (6-byte misalignment) case.
   Four combined doublewords per iteration; r6/r7 alternate as the
   "previous" and "next" source doublewords; CTR counts iterations.  */
L(du6_loop):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
/* Second doubleword.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 48
sldi 8,6, 64-48
#else
sldi 0,7, 48
srdi 8,6, 64-48
#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
/* Third doubleword.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
/* Fourth doubleword.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 48
sldi 8,6, 64-48
#else
sldi 0,7, 48
srdi 8,6, 64-48
#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
/* Advance src (r5) and dst (r4) by 32 bytes and loop on CTR.  */
addi 5,5,32
addi 4,4,32
bdnz+ L(du6_loop)
.align 4
L(du6_fini):
/* calculate and store the final DW */
/* Emit the one remaining combined doubleword for the 48-bit-shift
   case, then jump to the common tail handler.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
or 0,0,8
std 0,0(4)
b L(du_done)

.align 4
L(du7_do):
/* Entry for a source misaligned by 7 bytes (shift = 56 bits).
   Structure mirrors L(du6_do); see the notes there.  */
bf 30,L(du7_1dw)

/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
/* Second pre-loop doubleword.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 56
sldi 8,6, 64-56
#else
sldi 0,7, 56
srdi 8,6, 64-56
#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
addi 4,4,16
addi 5,5,32
blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du7_loop)
/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
or 0,0,8
std 0,0(4)
/* Rotate the double-buffer: old high word moves to r6, refill r7.  */
mr 6,7
ld 7,0(5)
addi 5,5,8
addi 4,4,8
beq cr6,L(du7_fini) /* if total DWs = 4, then bypass loop */
b L(du7_loop)
.align 4
L(du7_1dw):
/* Pre-loop path when fewer doublewords remain: form one combined
   doubleword; store it and advance only if bit 31 indicates an odd
   doubleword, then fall into the loop.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
addi 5,5,16
or 0,0,8
bf 31,L(du7_loop)
mr 6,7
ld 7,0(5)
addi 5,5,8
std 0,0(4)
addi 4,4,8
.align 4
/* copy 32 bytes at a time */
/* Main unrolled loop for the 56-bit-shift (7-byte misalignment) case.
   Four combined doublewords per iteration; r6/r7 alternate as the
   "previous" and "next" source doublewords; CTR counts iterations.  */
L(du7_loop):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
/* Second doubleword.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 56
sldi 8,6, 64-56
#else
sldi 0,7, 56
srdi 8,6, 64-56
#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
/* Third doubleword.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
/* Fourth doubleword.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 56
sldi 8,6, 64-56
#else
sldi 0,7, 56
srdi 8,6, 64-56
#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
/* Advance src (r5) and dst (r4) by 32 bytes and loop on CTR.  */
addi 5,5,32
addi 4,4,32
bdnz+ L(du7_loop)
.align 4
L(du7_fini):
/* calculate and store the final DW */
/* Emit the one remaining combined doubleword for the 56-bit-shift
   case, then jump to the common tail handler.  */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
or 0,0,8
std 0,0(4)
b L(du_done)

.align 4
L(du_done):
/* Common tail for all unaligned-source cases.
   NOTE(review): r31 appears to hold the original byte length and r12
   the original source pointer, saved before this excerpt -- confirm
   against the prologue/dispatch code above.  */
/* r0 = length with the low 3 bits cleared, i.e. the byte count already
   copied as doublewords (rldicr with mask ending at bit 60).  */
rldicr 0,31,0,60
/* Copy the low nibble of r31 into cr7 so bf 29/30/31 below test the
   4-, 2-, and 1-byte tail bits respectively.  */
mtcrf 0x01,31
beq cr1,0f /* If the tail is 0 bytes we are done! */

/* Advance dst (r3) and src (r12) past the doubleword-copied bytes.  */
add 3,3,0
add 12,12,0
/* At this point we have a tail of 0-7 bytes and we know that the
destination is double word aligned. */
4: bf 29,2f
lwz 6,0(12)
addi 12,12,4
stw 6,0(3)
addi 3,3,4
2: bf 30,1f
lhz 6,0(12)
addi 12,12,2
sth 6,0(3)
addi 3,3,2
1: bf 31,0f
lbz 6,0(12)
stb 6,0(3)
0:
/* Return original dst pointer. */
/* Restore r31 and the saved original dst (return value) from below
   the stack pointer; presumably stored there by the prologue, which
   is above this excerpt.  */
ld 31,-8(1)
ld 3,-16(1)
blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)
1500 | |