/* Optimized memcpy implementation for PowerPC32 on PowerPC64.
   Copyright (C) 2003-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination word (4-byte) aligned.  Further optimization is
   possible when both source and destination are word aligned.
   Each case has an optimized unrolled loop.  */

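/* For orientation, a rough C-level sketch of the dispatch logic below.
   The helper names are invented here for illustration only and do not
   exist in this file or in glibc:

	void *memcpy (void *dst, const void *src, size_t len)
	{
	  void *ret = dst;                      // r3 is preserved in r30
	  if (len < 32)
	    return short_copy (dst, src, len);  // .L2: binary move blocks
	  copy_0_3 (&dst, &src, &len);          // word align the destination
	  if (((uintptr_t) src & 3) == ((uintptr_t) ret & 3))
	    word_loop (dst, src, len);          // aligned loop, 16 bytes/iter
	  else
	    shift_loop (dst, src, len);         // .L6: load/shift/store loop
	  copy_tail (dst, src, len & 3);        // final 0-3 bytes
	  return ret;
	}
*/
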
	.machine power4
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	stwu	1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw	30,20(1)
	cfi_offset(30,(20-32))
	mr	30,3
	cmplwi	cr1,5,31
	stw	31,24(1)
	cfi_offset(31,(24-32))
	neg	0,3
	andi.	11,3,3	/* check alignment of dst.  */
	clrlwi	0,0,30	/* Number of bytes until the 1st word of dst.  */
	clrlwi	10,4,30	/* check alignment of src.  */
	cmplwi	cr6,5,8
	ble-	cr1,.L2	/* If move < 32 bytes use short move code.  */
	cmplw	cr6,10,11
	mr	12,4
	srwi	9,5,2		/* Number of full words remaining.  */
	mtcrf	0x01,0
	mr	31,5
	beq	.L0

	subf	31,0,5
	/* Move 0-3 bytes as needed to get the destination word aligned.  */
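	/* CR7 was loaded from the low bits of the alignment count in r0
	   (mtcrf 0x01,0 above), so CR bit 31 selects the single byte move
	   and CR bit 30 the halfword move of this 0-3 byte prefix.  */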
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,0f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
0:
	clrlwi	10,12,30	/* check alignment of src again.  */
	srwi	9,31,2	/* Number of full words remaining.  */

	/* Copy words from source to destination, assuming the destination is
	   aligned on a word boundary.

	   At this point we know there are at least 25 bytes left (32-7) to copy.
	   The next step is to determine if the source is also word aligned.
	   If not, branch to the unaligned move code at .L6, which uses
	   a load, shift, store strategy.

	   Otherwise source and destination are word aligned, and we can use
	   the optimized word copy loop.  */
.L0:
	clrlwi	11,31,30	/* calculate the number of tail bytes */
	mtcrf	0x01,9
	bne-	cr6,.L6	/* If source is not word aligned.  */

	/* Move words where destination and source are word aligned.
	   Use an unrolled loop to copy 4 words (16 bytes) per iteration.
	   If the copy is not an exact multiple of 16 bytes, 1-3
	   words are copied as needed to set up the main loop.  After
	   the main loop exits there may be a tail of 1-3 bytes.  These bytes are
	   copied a halfword/byte at a time as needed to preserve alignment.  */

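	/* Setup: r8 = number of 16-byte loop iterations, cr1 tells whether
	   at least 4 full words remain, cr6 tells whether there are any
	   tail bytes; r11/r10 become the running src/dst pointers.  */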
	srwi	8,31,4	/* calculate the 16 byte loop count */
	cmplwi	cr1,9,4
	cmplwi	cr6,11,0
	mr	11,12

	bf	30,1f
	lwz	6,0(12)
	lwz	7,4(12)
	addi	11,12,8
	mtctr	8
	stw	6,0(3)
	stw	7,4(3)
	addi	10,3,8
	bf	31,4f
	lwz	0,8(12)
	stw	0,8(3)
	blt	cr1,3f
	addi	11,12,12
	addi	10,3,12
	b	4f
	.align	4
1:
	mr	10,3
	mtctr	8
	bf	31,4f
	lwz	6,0(12)
	addi	11,12,4
	stw	6,0(3)
	addi	10,3,4

	.align	4
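	/* Main aligned loop: each iteration moves four words (16 bytes)
	   from r11 to r10 through r6/r7/r8/r0; ctr holds the iteration
	   count computed above.  */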
4:
	lwz	6,0(11)
	lwz	7,4(11)
	lwz	8,8(11)
	lwz	0,12(11)
	stw	6,0(10)
	stw	7,4(10)
	stw	8,8(10)
	stw	0,12(10)
	addi	11,11,16
	addi	10,10,16
	bdnz	4b
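	/* Loop epilogue: r0 = number of bytes already moved as whole words;
	   reload CR7 from the total byte count so bits 30-31 select the
	   halfword/byte tail moves.  If cr6 says there are no tail bytes,
	   fall through to the return.  */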
3:
	clrrwi	0,31,2
	mtcrf	0x01,31
	beq	cr6,0f
.L9:
	add	3,3,0
	add	12,12,0

/* At this point we have a tail of 0-3 bytes and we know that the
   destination is word aligned.  */
2:	bf	30,1f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,0f
	lbz	6,0(12)
	stb	6,0(3)
0:
	/* Return original dst pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries, it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  While the destination and stores may
   still be unaligned, this is only an issue for page (4096 byte
   boundary) crossing, which should be rare for these short moves.
   The hardware handles this case automatically with a small delay.  */

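/* An illustrative C sketch of the binary (8,4,2,1) decomposition used
   below; the copyN helpers are hypothetical names, and this assumes
   len < 32.  The code below tests the same bits via CR7 (bf 28/29/30/31)
   rather than explicit masks:

	if (len & 16) copy16 (dst, src), dst += 16, src += 16;
	if (len & 8)  copy8  (dst, src), dst += 8,  src += 8;
	if (len & 4)  copy4  (dst, src), dst += 4,  src += 4;
	if (len & 2)  copy2  (dst, src), dst += 2,  src += 2;
	if (len & 1)  copy1  (dst, src);
*/
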
	.align	4
.L2:
	mtcrf	0x01,5
	neg	8,4
	clrrwi	11,4,2
	andi.	0,8,3
	ble	cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
	cmplwi	cr1,5,16
	mr	10,5
	mr	12,4
	cmplwi	cr6,0,2
	beq	.L3	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
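	/* r11 holds the source rounded down to a word boundary, so the
	   lwz below reads the whole aligned word containing the first
	   1-3 bytes; an aligned word load cannot cross a page boundary,
	   so reading a byte or two before 'src' is safe.  cr6 compares
	   the prefix length in r0 against 2 to pick the store sequence.  */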
	lwz	6,0(11)
	subf	10,0,5
	add	12,4,0
	blt	cr6,5f
	srwi	7,6,16
	bgt	cr6,3f
#ifdef __LITTLE_ENDIAN__
	sth	7,0(3)
#else
	sth	6,0(3)
#endif
	b	7f
	.align	4
3:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,24
	stb	6,0(3)
	sth	7,1(3)
#else
	stb	7,0(3)
	sth	6,1(3)
#endif
	b	7f
	.align	4
5:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,8
#endif
	stb	6,0(3)
7:
	cmplwi	cr1,10,16
	add	3,3,0
	mtcrf	0x01,10
	.align	4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
	blt	cr1,8f
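	/* CR7 holds the low bits of the remaining length (r5 or r10), so
	   bit 28 selects the 8-byte move, bit 29 the word, bit 30 the
	   halfword, and bit 31 the final byte; cr1 gates the 16-byte move.  */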
16:	/* Move 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	6,8(12)
	stw	7,4(3)
	lwz	7,12(12)
	addi	12,12,16
	stw	6,8(3)
	stw	7,12(3)
	addi	3,3,16
8:	/* Move 8 bytes.  */
	bf	28,4f
	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Move 4 bytes.  */
	bf	29,2f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Move 2-3 bytes.  */
	bf	30,1f
	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr
1:	/* Move 1 byte.  */
	bf	31,0f
	lbz	6,0(12)
	stb	6,0(3)
0:
	/* Return original dst pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

/* Special case to copy 0-8 bytes.  */
	.align	4
.LE8:
	mr	12,4
	bne	cr6,4f
	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	/* Return original dst pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr
	.align	4
4:	bf	29,2b
	lwz	6,0(4)
	stw	6,0(3)
6:
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr
	.align	4
5:
	bf	31,0f
	lbz	6,4(4)
	stb	6,4(3)
	.align	4
0:
	/* Return original dst pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
.L6:

	/* Copy words where the destination is aligned but the source is
	   not.  Use aligned word loads from the source, shifted to realign
	   the data, to allow aligned destination stores.
	   Use an unrolled loop to copy 4 words (16 bytes) per iteration.
	   A single word is retained for storing at loop exit to avoid walking
	   off the end of a page within the loop.
	   If the copy is not an exact multiple of 16 bytes, 1-3
	   words are copied as needed to set up the main loop.  After
	   the main loop exits there may be a tail of 1-3 bytes.  These bytes are
	   copied a halfword/byte at a time as needed to preserve alignment.  */

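/* An illustrative C sketch of the realignment step, shown for the
   big-endian case (the little-endian variants swap the shift
   directions).  'sh' is the source misalignment in bits and 'w0'/'w1'
   are two consecutive aligned source words; the names are invented:

	uint32_t merged = (w0 << sh) | (w1 >> (32 - sh));

   r10 and r9 below hold sh and 32-sh respectively.  */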
	cmplwi	cr6,11,0	/* are there tail bytes left?  */
	subf	5,10,12	/* back up src pointer to prev word alignment */
	slwi	10,10,3	/* calculate number of bits to shift 1st word left */
	addi	11,9,-1	/* we move one word after the loop */
	srwi	8,11,2	/* calculate the 16 byte loop count */
	lwz	6,0(5)	/* load 1st src word into R6 */
	mr	4,3
	lwz	7,4(5)	/* load 2nd src word into R7 */
	mtcrf	0x01,11
	subfic	9,10,32	/* number of bits to shift 2nd word right */
	mtctr	8
	bf	30,1f

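	/* Loop prologue: r6/r7 are pre-loaded with the first two source
	   words, and each store below consumes them and loads one new
	   word.  The peeled copies (0-3 words, selected by CR bits 30-31
	   of the word count minus one) bring the remainder to a multiple
	   of 4 words, leaving one word for the store at 8:.  */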
	/* there are at least two words to copy, so copy them */
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10	/* shift 1st src word to left align it in R0 */
	srw	8,7,9	/* shift 2nd src word to right align it in R8 */
#endif
	or	0,0,8	/* or them to get word to store */
	lwz	6,8(5)	/* load the 3rd src word */
	stw	0,0(4)	/* store the 1st dst word */
#ifdef __LITTLE_ENDIAN__
	srw	0,7,10
	slw	8,6,9
#else
	slw	0,7,10	/* now left align 2nd src word into R0 */
	srw	8,6,9	/* shift 3rd src word to right align it in R8 */
#endif
	or	0,0,8	/* or them to get word to store */
	lwz	7,12(5)
	stw	0,4(4)	/* store the 2nd dst word */
	addi	4,4,8
	addi	5,5,16
	bf	31,4f
	/* there is a third word to copy, so copy it */
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10	/* shift 3rd src word to left align it in R0 */
	srw	8,7,9	/* shift 4th src word to right align it in R8 */
#endif
	or	0,0,8	/* or them to get word to store */
	stw	0,0(4)	/* store 3rd dst word */
	mr	6,7
	lwz	7,0(5)
	addi	5,5,4
	addi	4,4,4
	b	4f
	.align	4
1:
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10	/* shift 1st src word to left align it in R0 */
	srw	8,7,9	/* shift 2nd src word to right align it in R8 */
#endif
	addi	5,5,8
	or	0,0,8	/* or them to get word to store */
	bf	31,4f
	mr	6,7
	lwz	7,0(5)
	addi	5,5,4
	stw	0,0(4)	/* store the 1st dst word */
	addi	4,4,4

	.align	4
4:
	/* copy 16 bytes at a time */
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10
	srw	8,7,9
#endif
	or	0,0,8
	lwz	6,0(5)
	stw	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srw	0,7,10
	slw	8,6,9
#else
	slw	0,7,10
	srw	8,6,9
#endif
	or	0,0,8
	lwz	7,4(5)
	stw	0,4(4)
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10
	srw	8,7,9
#endif
	or	0,0,8
	lwz	6,8(5)
	stw	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srw	0,7,10
	slw	8,6,9
#else
	slw	0,7,10
	srw	8,6,9
#endif
	or	0,0,8
	lwz	7,12(5)
	stw	0,12(4)
	addi	5,5,16
	addi	4,4,16
	bdnz+	4b
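	/* The last two source words loaded are still in r6/r7; merging and
	   storing the final word here, outside the loop, avoids loading
	   past the end of the source buffer inside the loop.  */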
8:
	/* calculate and store the final word */
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10
	srw	8,7,9
#endif
	or	0,0,8
	stw	0,0(4)
3:
	clrrwi	0,31,2
	mtcrf	0x01,31
	bne	cr6,.L9	/* Branch to the tail code at .L9 if there are
			   1-3 tail bytes left; otherwise we are done.  */

	/* Return original dst pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr
END (memcpy)

libc_hidden_builtin_def (memcpy)