/* Optimized memcpy implementation for PowerPC A2.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <rtld-global-offsets.h>

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 4  /* number of cache lines to prefetch ahead of SRC */
#define ZERO_AHEAD 2      /* number of cache lines ahead of DST to zero */

        .section ".toc","aw"
__GLRO_DEF(dl_cache_line_size)


        .section ".text"
        .align 2


        .machine a2
ENTRY (MEMCPY, 5)
        CALL_MCOUNT 3

        dcbt    0,r4            /* Prefetch ONE SRC cacheline */
        cmpldi  cr1,r5,16       /* is size < 16 ? */
        mr      r6,r3           /* Copy dest reg to r6. */
        blt+    cr1,L(shortcopy)


/* Big copy (16 bytes or more)

   Figure out how far to the nearest quadword boundary, or if we are
   on one already.  Also get the cache line size.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
*/
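
/* A rough C sketch of the setup below (illustrative only; dst, src and
   len stand for r3, r4 and r5; to_align, src_off and line_mask land in
   r8, r7 and r10):

     size_t to_align = (-(uintptr_t) dst) & 15;  // bytes to 16-byte bdry
     ptrdiff_t src_off = src - dst;              // src reached as dst + off
     size_t line_mask = cache_line_size - 1;
*/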

        neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry */
        /* Get the cache line size. */
        __GLRO (r9, dl_cache_line_size,
                RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)
        clrldi  r8,r8,64-4      /* # bytes to 16-byte boundary */
        sub     r7,r4,r3        /* compute offset to src from dest */
        cmpldi  cr0,r8,0        /* Were we aligned on a 16 byte bdy? */
        addi    r10,r9,-1       /* Cache line mask */
        beq+    L(dst_aligned)



/* Destination is not aligned on quadword boundary.  Get us to one.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
   r7 - offset to src from dest
   r8 - number of bytes to quadword boundary
*/
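
/* Illustrative C equivalent of the bit-tested head copy below (the asm
   moves the low four bits of r8 into cr7 and tests them with bf; the
   variable names are hypothetical):

     unsigned char *d = dst;
     len -= to_align;
     if (to_align & 1) { *d = d[src_off]; d += 1; }
     if (to_align & 2) { *(uint16_t *) d = *(uint16_t *) (d + src_off); d += 2; }
     if (to_align & 4) { *(uint32_t *) d = *(uint32_t *) (d + src_off); d += 4; }
     if (to_align & 8) { *(uint64_t *) d = *(uint64_t *) (d + src_off); d += 8; }
     src = d + src_off;         // add r4,r7,r6
*/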

        mtcrf   0x01,r8         /* put #bytes to boundary into cr7 */
        subf    r5,r8,r5        /* adjust remaining len */

        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte */
        stb     r0,0(r6)
        addi    r6,r6,1
1:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 bytes */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        add     r4,r7,r6        /* update src addr */



/* Dest is quadword aligned now.

   Lots of decisions to make.  If we are copying less than a cache
   line we won't be here long.  If we are not on a cache line
   boundary we need to get there.  And then we need to figure out
   how many cache lines ahead to pre-touch.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
*/
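
/* Dispatch sketch (illustrative):

     if (cache_line_size == 0)        -> byte-copy fallback below
     else if (len <= line_mask)       -> L(lessthancacheline)
     else if (cache_line_size == 128) -> L(big_lines)
     else                             -> the 64-byte line path
*/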


        .align 4
L(dst_aligned):
        cmpdi   cr0,r9,0        /* Cache line size set? */
        bne+    cr0,L(cachelineset)

/* Cache line size not set: generic byte copy without much optimization */
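/* Roughly, in C (illustrative):

     if (len & 1) { *d++ = *s++; len--; }
     while (len) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; len -= 2; }
*/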
        clrldi. r0,r5,63        /* If length is odd copy one byte */
        beq     L(cachelinenotset_align)
        lbz     r7,0(r4)        /* Read one byte from source */
        addi    r5,r5,-1        /* Update length */
        addi    r4,r4,1         /* Update source pointer address */
        stb     r7,0(r6)        /* Store one byte at dest */
        addi    r6,r6,1         /* Update dest pointer address */
L(cachelinenotset_align):
        cmpdi   cr7,r5,0        /* If length is 0 return */
        beqlr   cr7
        ori     r2,r2,0         /* Force a new dispatch group */
L(cachelinenotset_loop):
        addic.  r5,r5,-2        /* Update length */
        lbz     r7,0(r4)        /* Load 2 bytes from source */
        lbz     r8,1(r4)
        addi    r4,r4,2         /* Update source pointer address */
        stb     r7,0(r6)        /* Store 2 bytes on dest */
        stb     r8,1(r6)
        addi    r6,r6,2         /* Update dest pointer address */
        bne     L(cachelinenotset_loop)
        blr


L(cachelineset):
        cmpd    cr5,r5,r10      /* Less than a cacheline to go? */

        neg     r7,r6           /* How far to next cacheline bdy? */

        addi    r6,r6,-8        /* prepare for stdu */
        cmpdi   cr0,r9,128
        addi    r4,r4,-8        /* prepare for ldu */


        ble+    cr5,L(lessthancacheline)

        beq-    cr0,L(big_lines) /* 128 byte line code */



/* More than a cacheline left to go, and using 64 byte cachelines */

        clrldi  r7,r7,64-6      /* How far to next cacheline bdy? */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already? */

        /* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy? */

        /* How many full cache lines to copy after getting to a line bdy? */
        srdi    r10,r5,6

        cmpldi  r10,0           /* If no full cache lines to copy ... */
        li      r11,0           /* number of cache lines to copy with prefetch */
        beq     L(nocacheprefetch)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do. */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,64+8        /* prefetch distance */
        ble     L(lessthanmaxprefetch)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed. */
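
/* Net effect of this branch and the fall-through (illustrative):

     r10 = min (full_lines, PREFETCH_AHEAD);  // batch size, loaded into ctr
     r11 = full_lines - r10;                  // lines left for the main loop
*/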

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch):
        mtctr   r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder. */

L(prefetchSRC):
        dcbt    r12,r4
        addi    r12,r12,64
        bdnz    L(prefetchSRC)


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7  - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch):
        mtctr   r7

        cmpldi  cr1,r5,64       /* Less than a cache line to copy? */

/* How many bytes are left after we copy whatever full
   cache lines we can get? */
        clrldi  r5,r5,64-6

        beq     cr6,L(cachelinealigned)


/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline)


        .align 4
L(cachelinealigned):            /* copy whole cache lines */

        blt-    cr1,L(lessthancacheline) /* size <64 */

L(outerloop):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop)

        li      r11,64*ZERO_AHEAD+8 /* DCBZ distance */

        .align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
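/* Each iteration touches a future SRC line with dcbt (r12 bytes ahead)
   and establishes a future DST line with dcbz (64*ZERO_AHEAD+8 bytes
   ahead of r6), so stores to that line need not fetch it from memory
   first, then copies one 64-byte line as eight doublewords. */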
L(loop):                        /* Copy aligned body */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        addi    r4, r4,0x40
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        stdu    r0, 0x40(r6)

        bdnz    L(loop)


L(endloop):
        cmpdi   r10,0
        beq-    L(endloop2)
        mtctr   r10

L(loop2):                       /* Copy aligned body */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        addi    r4, r4,0x40
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        stdu    r0, 0x40(r6)

        bdnz    L(loop2)
L(endloop2):


        .align 4
L(lessthancacheline):           /* Less than a cache line left to copy? */
        cmpldi  cr0,r5,16
        srdi    r7,r5,4         /* divide size by 16 */
        blt-    L(do_lt16)
        mtctr   r7

L(copy_remaining):
        ld      r8,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r8,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(copy_remaining)

L(do_lt16):                     /* less than 16 ? */
        cmpldi  cr0,r5,0        /* copy remaining bytes (0-15) */
        beqlr+                  /* nothing left to copy */
        addi    r4,r4,8
        addi    r6,r6,8
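/* Tail copy sketch (illustrative): r6 is the dest for the remaining
   0-15 bytes, with src reached via the offset r7 = src - dst; test
   bits 8, 4, 2 and 1 of the length, largest chunk first:

     if (len & 8) { copy 8 bytes; d += 8; }
     if (len & 4) { copy 4 bytes; d += 4; }
     if (len & 2) { copy 2 bytes; d += 2; }
     if (len & 1) { copy 1 byte; }
*/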
L(shortcopy):                   /* SIMPLE COPY to handle size <= 15 bytes */
        mtcrf   0x01,r5
        sub     r7,r4,r6
        bf-     cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 bytes */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte */
        stb     r0,0(r6)
1:
        blr




/* Similar to the 64-byte-line code above, but for 128 byte cache lines:
   masks use 127, shifts use 7, and the prefetch and dcbz distances are
   scaled to 128 bytes. */

L(big_lines):

        clrldi  r7,r7,64-7      /* How far to next cacheline bdy? */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already? */

        /* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy? */

        /* How many full cache lines to copy after getting to a line bdy? */
        srdi    r10,r5,7

        cmpldi  r10,0           /* If no full cache lines to copy ... */
        li      r11,0           /* number of cache lines to copy with prefetch */
        beq     L(nocacheprefetch_128)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do. */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,128+8       /* prefetch distance */
        ble     L(lessthanmaxprefetch_128)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed. */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
        mtctr   r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder. */

L(prefetchSRC_128):
        dcbt    r12,r4
        addi    r12,r12,128
        bdnz    L(prefetchSRC_128)


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7  - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch_128):
        mtctr   r7

        cmpldi  cr1,r5,128      /* Less than a cache line to copy? */

/* How many bytes are left after we copy whatever full
   cache lines we can get? */
        clrldi  r5,r5,64-7

        beq     cr6,L(cachelinealigned_128)


/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline_128):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline_128)


L(cachelinealigned_128):        /* copy whole cache lines */

        blt-    cr1,L(lessthancacheline) /* size <128 */

L(outerloop_128):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop_128)

        li      r11,128*ZERO_AHEAD+8 /* DCBZ distance */

        .align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
L(loop_128):                    /* Copy aligned body */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ld      r0, 0x80(r4)
        addi    r4, r4,0x80
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    L(loop_128)


L(endloop_128):
        cmpdi   r10,0
        beq-    L(endloop2_128)
        mtctr   r10

L(loop2_128):                   /* Copy aligned body */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ld      r0, 0x80(r4)
        addi    r4, r4,0x80
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    L(loop2_128)
L(endloop2_128):

        b       L(lessthancacheline)


END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)