/* Optimized memcpy implementation for PowerPC A2.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <rtld-global-offsets.h>

#define PREFETCH_AHEAD 4        /* Number of cache lines to prefetch ahead of SRC.  */
#define ZERO_AHEAD 2            /* Number of cache lines to zero ahead of DST.  */

        .machine  a2
EALIGN (memcpy, 5, 0)
        CALL_MCOUNT

        dcbt    0,r4            /* Prefetch ONE SRC cacheline.  */
        cmplwi  cr1,r5,16       /* Is size < 16?  */
        mr      r6,r3           /* Copy dest reg to r6.  */
        blt+    cr1,L(shortcopy)


        /* Big copy (16 bytes or more).

           Figure out how far to the nearest quadword boundary, or if we are
           on one already.

           r3 - return value (always)
           r4 - current source addr
           r5 - copy length
           r6 - current dest addr
        */
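
        /* In C terms, the alignment math below is roughly (an editorial
           sketch, not part of the original source):

               size_t to_bdry = -(uintptr_t) dst & 0xf;  // bytes to 16-byte bdry
               ptrdiff_t s_off = src - dst;              // src = dst + s_off

           "neg r8,r3" gives -dst, whose low 4 bits are exactly the byte
           count to the next 16-byte boundary, and "clrlwi r8,r8,32-4"
           keeps just those 4 bits.  */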

        neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry  */
        clrlwi  r8,r8,32-4      /* isolate those low 4 bits  */
        sub     r7,r4,r3        /* compute offset to src from dest  */
        cmplwi  cr0,r8,0        /* Were we aligned on a 16 byte bdy?  */
        beq+    L(dst_aligned)



        /* Destination is not aligned on quadword boundary.  Get us to one.

           r3 - return value (always)
           r4 - current source addr
           r5 - copy length
           r6 - current dest addr
           r7 - offset to src from dest
           r8 - number of bytes to quadword boundary
        */
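
        /* In C terms the mtcrf/bf sequence below is (editorial sketch):

               if (r8 & 1) copy 1 byte;    // tested by bf cr7*4+3
               if (r8 & 2) copy 2 bytes;   // tested by bf cr7*4+2
               if (r8 & 4) copy 4 bytes;   // tested by bf cr7*4+1
               if (r8 & 8) copy 8 bytes;   // tested by bf cr7*4+0

           mtcrf 0x01 moves the low 4 bits of r8 into cr7, and each
           bf ("branch if false") skips its chunk when the bit is clear;
           cr7*4+3 is the least-significant of those bits.  */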

        mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
        subf    r5,r8,r5        /* adjust remaining len  */

        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
        addi    r6,r6,1
1:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+0,8f
        lfdx    fp0,r7,r6       /* copy 8 bytes via an FP register  */
        stfd    fp0,0(r6)
        addi    r6,r6,8
8:
        add     r4,r7,r6        /* update src addr  */



        /* Dest is quadword aligned now.

           Lots of decisions to make.  If we are copying less than a cache
           line we won't be here long.  If we are not on a cache line
           boundary we need to get there.  And then we need to figure out
           how many cache lines ahead to pre-touch.

           r3 - return value (always)
           r4 - current source addr
           r5 - copy length
           r6 - current dest addr
        */
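
        /* In outline (editorial summary): look up the runtime cache line
           size; if it is unknown (0), fall back to a plain byte-pair copy
           loop; otherwise take the 64-byte-line path below or the
           128-byte-line path at L(big_lines): copy quadwords up to a line
           boundary, stream whole lines with dcbt/dcbz, and finish the
           sub-line tail at L(lessthancacheline).  */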


        .align 4
L(dst_aligned):


#ifdef PIC
        mflr    r0
/* Establishes GOT addressability so we can load the cache line size
   from rtld_global_ro.  This value was set from the aux vector during
   startup.  */
        SETUP_GOT_ACCESS(r9,got_label)
        addis   r9,r9,_GLOBAL_OFFSET_TABLE_-got_label@ha
        addi    r9,r9,_GLOBAL_OFFSET_TABLE_-got_label@l
        mtlr    r0
#endif
        __GLRO(r9, r9, _dl_cache_line_size,
               RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)

        cmplwi  cr5,r9,0
        bne+    cr5,L(cachelineset)

/* Cache line size not set: generic byte copy without much optimization.  */
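/* Equivalent C for this fallback (editorial sketch):

       if (len & 1) { *d++ = *s++; len--; }     // peel the odd byte
       while (len) { d[0] = s[0]; d[1] = s[1];  // then two bytes per pass
                     d += 2; s += 2; len -= 2; }
*/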
        andi.   r0,r5,1         /* If length is odd copy one byte.  */
        beq     L(cachelinenotset_align)
        lbz     r7,0(r4)        /* Read one byte from source.  */
        addi    r5,r5,-1        /* Update length.  */
        addi    r4,r4,1         /* Update source pointer address.  */
        stb     r7,0(r6)        /* Store one byte on dest.  */
        addi    r6,r6,1         /* Update dest pointer address.  */
L(cachelinenotset_align):
        cmpwi   cr7,r5,0        /* If length is 0 return.  */
        beqlr   cr7
        ori     r2,r2,0         /* Force a new dispatch group.  */
L(cachelinenotset_loop):
        addic.  r5,r5,-2        /* Update length.  */
        lbz     r7,0(r4)        /* Load 2 bytes from source.  */
        lbz     r8,1(r4)
        addi    r4,r4,2         /* Update source pointer address.  */
        stb     r7,0(r6)        /* Store 2 bytes on dest.  */
        stb     r8,1(r6)
        addi    r6,r6,2         /* Update dest pointer address.  */
        bne     L(cachelinenotset_loop)
        blr


L(cachelineset):

        addi    r10,r9,-1

        cmpw    cr5,r5,r10      /* Less than a cacheline to go?  */

        neg     r7,r6           /* How far to next cacheline bdy?  */

        addi    r6,r6,-8        /* prepare for stfdu  */
        cmpwi   cr0,r9,128
        addi    r4,r4,-8        /* prepare for lfdu  */


        ble+    cr5,L(lessthancacheline)

        beq-    cr0,L(big_lines) /* 128 byte line code  */




        /* More than a cacheline left to go, and using 64 byte cachelines.  */

        clrlwi  r7,r7,32-6      /* How far to next cacheline bdy?  */

        cmplwi  cr6,r7,0        /* Are we on a cacheline bdy already?  */

        /* Reduce total len by what it takes to get to the next cache line.  */
        subf    r5,r7,r5
        srwi    r7,r7,4         /* How many qws to get to the line bdy?  */

        /* How many full cache lines to copy after getting to a line bdy?  */
        srwi    r10,r5,6

        cmplwi  r10,0           /* If no full cache lines to copy ...  */
        li      r11,0           /* number cachelines to copy with prefetch  */
        beq     L(nocacheprefetch)
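
        /* Worked example (editorial): len = 500 with dst 16 bytes past a
           64-byte line boundary: r7 = 48 bytes = 3 quadwords to the
           boundary, r5 = 500 - 48 = 452, r10 = 452 >> 6 = 7 full lines,
           and 452 & 63 = 4 tail bytes are left for L(lessthancacheline).  */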

        /* We are here because we have at least one full cache line to copy,
           and therefore some pre-touching to do.  */

        cmplwi  r10,PREFETCH_AHEAD
        li      r12,64+8        /* prefetch distance  */
        ble     L(lessthanmaxprefetch)

        /* We can only do so much pre-fetching.  R11 will have the count of
           lines left to prefetch after the initial batch of prefetches
           are executed.  */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD
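
        /* E.g. (editorial example): with 10 full lines to copy and
           PREFETCH_AHEAD = 4, the batch below pre-touches 4 lines; then
           r11 = 6 lines are copied in L(loop), each iteration issuing one
           more dcbt 4 lines ahead, and the last r10 = 4 lines are copied
           in L(loop2) with no further prefetching.  */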

L(lessthanmaxprefetch):
        mtctr   r10

        /* At this point r10/ctr hold the number of lines to prefetch in this
           initial batch, and r11 holds any remainder.  */

L(prefetchSRC):
        dcbt    r12,r4
        addi    r12,r12,64
        bdnz    L(prefetchSRC)


        /* Prefetching is done, or was not needed.

           cr6 - are we on a cacheline boundary already?
           r7  - number of quadwords to the next cacheline boundary
        */

L(nocacheprefetch):
        mtctr   r7

        cmplwi  cr1,r5,64       /* Less than a cache line to copy?  */

        /* How many bytes are left after we copy whatever full
           cache lines we can get?  */
        clrlwi  r5,r5,32-6

        beq     cr6,L(cachelinealigned)


        /* Copy quadwords up to the next cacheline boundary.  */

L(aligntocacheline):
        lfd     fp9,0x08(r4)
        lfdu    fp10,0x10(r4)
        stfd    fp9,0x08(r6)
        stfdu   fp10,0x10(r6)
        bdnz    L(aligntocacheline)


        .align 4
L(cachelinealigned):            /* copy whole cache lines  */

        blt-    cr1,L(lessthancacheline) /* size < 64  */

L(outerloop):
        cmpwi   r11,0
        mtctr   r11
        beq-    L(endloop)

        li      r11,64*ZERO_AHEAD+8     /* DCBZ dist  */
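
        /* dcbz zeroes and establishes the destination line in cache, so
           the stores in L(loop) never have to fetch DST from memory
           first.  The +8 compensates for r6 still pointing 8 bytes before
           the current store position (stfdu pre-increments), putting the
           dcbz target exactly ZERO_AHEAD (2) lines ahead of the line
           being written.  */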

        .align 4
        /* Copy whole cachelines, optimized by prefetching SRC cacheline.  */
L(loop):                        /* Copy aligned body  */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
        lfd     fp9,0x08(r4)
        dcbz    r11,r6
        lfd     fp10,0x10(r4)
        lfd     fp11,0x18(r4)
        lfd     fp12,0x20(r4)
        stfd    fp9,0x08(r6)
        stfd    fp10,0x10(r6)
        stfd    fp11,0x18(r6)
        stfd    fp12,0x20(r6)
        lfd     fp9,0x28(r4)
        lfd     fp10,0x30(r4)
        lfd     fp11,0x38(r4)
        lfdu    fp12,0x40(r4)
        stfd    fp9,0x28(r6)
        stfd    fp10,0x30(r6)
        stfd    fp11,0x38(r6)
        stfdu   fp12,0x40(r6)

        bdnz    L(loop)


L(endloop):
        cmpwi   r10,0
        beq-    L(endloop2)
        mtctr   r10

L(loop2):                       /* Copy aligned body  */
        lfd     fp9,0x08(r4)
        lfd     fp10,0x10(r4)
        lfd     fp11,0x18(r4)
        lfd     fp12,0x20(r4)
        stfd    fp9,0x08(r6)
        stfd    fp10,0x10(r6)
        stfd    fp11,0x18(r6)
        stfd    fp12,0x20(r6)
        lfd     fp9,0x28(r4)
        lfd     fp10,0x30(r4)
        lfd     fp11,0x38(r4)
        lfdu    fp12,0x40(r4)
        stfd    fp9,0x28(r6)
        stfd    fp10,0x30(r6)
        stfd    fp11,0x38(r6)
        stfdu   fp12,0x40(r6)

        bdnz    L(loop2)
L(endloop2):


        .align 4
L(lessthancacheline):           /* Less than a cache line left to copy.  */
        cmplwi  cr0,r5,16
        srwi    r7,r5,4         /* divide size by 16  */
        blt-    L(do_lt16)
        mtctr   r7

L(copy_remaining):
        lfd     fp9,0x08(r4)
        lfdu    fp10,0x10(r4)
        stfd    fp9,0x08(r6)
        stfdu   fp10,0x10(r6)
        bdnz    L(copy_remaining)

L(do_lt16):                     /* fewer than 16 bytes left?  */
        cmplwi  cr0,r5,0        /* copy remaining bytes (0-15)  */
        beqlr+                  /* no rest to copy  */
        addi    r4,r4,8
        addi    r6,r6,8

L(shortcopy):                   /* SIMPLE COPY to handle size <= 15 bytes  */
        mtcrf   0x01,r5
        sub     r7,r4,r6
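        /* Same CR-bit dispatch as the destination-alignment code above,
           but testing bits of the remaining length, largest chunk first
           (editorial note).  */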
        bf-     cr7*4+0,8f
        lfdx    fp9,r7,r6       /* copy 8 bytes  */
        stfd    fp9,0(r6)
        addi    r6,r6,8
8:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
1:
        blr





        /* Similar to above, but for use with 128-byte lines.  */


L(big_lines):

        clrlwi  r7,r7,32-7      /* How far to next cacheline bdy?  */

        cmplwi  cr6,r7,0        /* Are we on a cacheline bdy already?  */

        /* Reduce total len by what it takes to get to the next cache line.  */
        subf    r5,r7,r5
        srwi    r7,r7,4         /* How many qws to get to the line bdy?  */

        /* How many full cache lines to copy after getting to a line bdy?  */
        srwi    r10,r5,7

        cmplwi  r10,0           /* If no full cache lines to copy ...  */
        li      r11,0           /* number cachelines to copy with prefetch  */
        beq     L(nocacheprefetch_128)


        /* We are here because we have at least one full cache line to copy,
           and therefore some pre-touching to do.  */

        cmplwi  r10,PREFETCH_AHEAD
        li      r12,128+8       /* prefetch distance  */
        ble     L(lessthanmaxprefetch_128)

        /* We can only do so much pre-fetching.  R11 will have the count of
           lines left to prefetch after the initial batch of prefetches
           are executed.  */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
        mtctr   r10

        /* At this point r10/ctr hold the number of lines to prefetch in this
           initial batch, and r11 holds any remainder.  */

L(prefetchSRC_128):
        dcbt    r12,r4
        addi    r12,r12,128
        bdnz    L(prefetchSRC_128)


        /* Prefetching is done, or was not needed.

           cr6 - are we on a cacheline boundary already?
           r7  - number of quadwords to the next cacheline boundary
        */

L(nocacheprefetch_128):
        mtctr   r7

        cmplwi  cr1,r5,128      /* Less than a cache line to copy?  */

        /* How many bytes are left after we copy whatever full
           cache lines we can get?  */
        clrlwi  r5,r5,32-7

        beq     cr6,L(cachelinealigned_128)


        /* Copy quadwords up to the next cacheline boundary.  */

L(aligntocacheline_128):
        lfd     fp9,0x08(r4)
        lfdu    fp10,0x10(r4)
        stfd    fp9,0x08(r6)
        stfdu   fp10,0x10(r6)
        bdnz    L(aligntocacheline_128)


L(cachelinealigned_128):        /* copy whole cache lines  */

        blt-    cr1,L(lessthancacheline) /* size < 128  */

L(outerloop_128):
        cmpwi   r11,0
        mtctr   r11
        beq-    L(endloop_128)

        li      r11,128*ZERO_AHEAD+8    /* DCBZ dist  */
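
        /* As in the 64-byte path: the dcbz in L(loop_128) targets the
           destination line ZERO_AHEAD (2) lines, i.e. 256 bytes, ahead
           of the one currently being written (editorial note).  */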

        .align 4
        /* Copy whole cachelines, optimized by prefetching SRC cacheline.  */
L(loop_128):                    /* Copy aligned body  */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
        lfd     fp9,0x08(r4)
        dcbz    r11,r6
        lfd     fp10,0x10(r4)
        lfd     fp11,0x18(r4)
        lfd     fp12,0x20(r4)
        stfd    fp9,0x08(r6)
        stfd    fp10,0x10(r6)
        stfd    fp11,0x18(r6)
        stfd    fp12,0x20(r6)
        lfd     fp9,0x28(r4)
        lfd     fp10,0x30(r4)
        lfd     fp11,0x38(r4)
        lfd     fp12,0x40(r4)
        stfd    fp9,0x28(r6)
        stfd    fp10,0x30(r6)
        stfd    fp11,0x38(r6)
        stfd    fp12,0x40(r6)
        lfd     fp9,0x48(r4)
        lfd     fp10,0x50(r4)
        lfd     fp11,0x58(r4)
        lfd     fp12,0x60(r4)
        stfd    fp9,0x48(r6)
        stfd    fp10,0x50(r6)
        stfd    fp11,0x58(r6)
        stfd    fp12,0x60(r6)
        lfd     fp9,0x68(r4)
        lfd     fp10,0x70(r4)
        lfd     fp11,0x78(r4)
        lfdu    fp12,0x80(r4)
        stfd    fp9,0x68(r6)
        stfd    fp10,0x70(r6)
        stfd    fp11,0x78(r6)
        stfdu   fp12,0x80(r6)

        bdnz    L(loop_128)


L(endloop_128):
        cmpwi   r10,0
        beq-    L(endloop2_128)
        mtctr   r10

L(loop2_128):                   /* Copy aligned body  */
        lfd     fp9,0x08(r4)
        lfd     fp10,0x10(r4)
        lfd     fp11,0x18(r4)
        lfd     fp12,0x20(r4)
        stfd    fp9,0x08(r6)
        stfd    fp10,0x10(r6)
        stfd    fp11,0x18(r6)
        stfd    fp12,0x20(r6)
        lfd     fp9,0x28(r4)
        lfd     fp10,0x30(r4)
        lfd     fp11,0x38(r4)
        lfd     fp12,0x40(r4)
        stfd    fp9,0x28(r6)
        stfd    fp10,0x30(r6)
        stfd    fp11,0x38(r6)
        stfd    fp12,0x40(r6)
        lfd     fp9,0x48(r4)
        lfd     fp10,0x50(r4)
        lfd     fp11,0x58(r4)
        lfd     fp12,0x60(r4)
        stfd    fp9,0x48(r6)
        stfd    fp10,0x50(r6)
        stfd    fp11,0x58(r6)
        stfd    fp12,0x60(r6)
        lfd     fp9,0x68(r4)
        lfd     fp10,0x70(r4)
        lfd     fp11,0x78(r4)
        lfdu    fp12,0x80(r4)
        stfd    fp9,0x68(r6)
        stfd    fp10,0x70(r6)
        stfd    fp11,0x78(r6)
        stfdu   fp12,0x80(r6)
        bdnz    L(loop2_128)
L(endloop2_128):

        b       L(lessthancacheline)


END (memcpy)
libc_hidden_builtin_def (memcpy)

