1 | /* Optimized memcpy implementation for PowerPC A2. |
2 | Copyright (C) 2010-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
#include <sysdep.h>
#include <rtld-global-offsets.h>

/* void *memcpy (void *dst, const void *src, size_t len);

   Optimized for the PowerPC A2 core: the destination is first brought
   to a quadword (16-byte) boundary, then whole cache lines are streamed
   with dcbt (touch) prefetching the source ahead and dcbz establishing
   destination lines without reading them.  The cache line size is read
   at run time from _dl_cache_line_size (set from the aux vector at
   startup); dedicated paths exist for 64- and 128-byte lines, plus a
   plain byte-copy fallback when the size is unknown (zero).  */

#define PREFETCH_AHEAD 4	/* number of cache lines to prefetch SRC ahead  */
#define ZERO_AHEAD 2		/* number of cache lines to dcbz DST ahead  */

	.machine a2
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
	cmplwi	cr1,r5,16	/* is size < 16 ?  */
	mr	r6,r3		/* Copy dest reg to r6; r3 is preserved as the
				   return value.  */
	blt+	cr1,L(shortcopy)


/* Big copy (16 bytes or more)

   Figure out how far to the nearest quadword boundary, or if we are
   on one already.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
 */

	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dest bdry  */
	clrlwi	r8,r8,32-4	/* align to 16byte boundary  */
	sub	r7,r4,r3	/* compute offset to src from dest  */
	cmplwi	cr0,r8,0	/* Were we aligned on a 16 byte bdy?  */
	beq+	L(dst_aligned)



/* Destination is not aligned on quadword boundary.  Get us to one.

   Each cr7 bit below corresponds to one bit of r8 (bytes-to-boundary),
   so testing bits 3..0 copies 1, 2, 4 and 8 bytes as needed.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
   r7 - offset to src from dest
   r8 - number of bytes to quadword boundary
 */

	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
	subf	r5,r8,r5	/* adjust remaining len  */

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte addr  */
	stb	r0,0(r6)
	addi	r6,r6,1
1:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte addr  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte addr  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+0,8f
	lfdx	r0,r7,r6	/* copy 8 byte addr  */
	stfd	r0,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* update src addr  */



/* Dest is quadword aligned now.

   Lots of decisions to make.  If we are copying less than a cache
   line we won't be here long.  If we are not on a cache line
   boundary we need to get there.  And then we need to figure out
   how many cache lines ahead to pre-touch.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
 */


	.align 4
L(dst_aligned):


#ifdef PIC
	mflr	r0
/* Establishes GOT addressability so we can load the cache line size
   from rtld_global_ro.  This value was set from the aux vector during
   startup.  */
	SETUP_GOT_ACCESS(r9,got_label)
	addis	r9,r9,_GLOBAL_OFFSET_TABLE_-got_label@ha
	addi	r9,r9,_GLOBAL_OFFSET_TABLE_-got_label@l
	mtlr	r0
#endif
	__GLRO(r9, r9, _dl_cache_line_size,
	       RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)

	cmplwi	cr5, r9, 0	/* Zero means the line size is unknown.  */
	bne+	cr5,L(cachelineset)

/* Cache line size not set: generic byte copy without much optimization  */
	andi.	r0,r5,1		/* If length is odd copy one byte.  */
	beq	L(cachelinenotset_align)
	lbz	r7,0(r4)	/* Read one byte from source.  */
	addi	r5,r5,-1	/* Update length.  */
	addi	r4,r4,1		/* Update source pointer address.  */
	stb	r7,0(r6)	/* Store one byte on dest.  */
	addi	r6,r6,1		/* Update dest pointer address.  */
L(cachelinenotset_align):
	cmpwi	cr7,r5,0	/* If length is 0 return.  */
	beqlr	cr7
	ori	r2,r2,0		/* Force a new dispatch group.  */
L(cachelinenotset_loop):	/* Length is even here, so copy in pairs.  */
	addic.	r5,r5,-2	/* Update length.  */
	lbz	r7,0(r4)	/* Load 2 bytes from source.  */
	lbz	r8,1(r4)
	addi	r4,r4,2		/* Update source pointer address.  */
	stb	r7,0(r6)	/* Store 2 bytes on dest.  */
	stb	r8,1(r6)
	addi	r6,r6,2		/* Update dest pointer address.  */
	bne	L(cachelinenotset_loop)
	blr


L(cachelineset):

	addi	r10,r9,-1	/* line size - 1, used as an alignment mask  */

	cmpw	cr5,r5,r10	/* Less than a cacheline to go?  */

	neg	r7,r6		/* How far to next cacheline bdy?  */

	addi	r6,r6,-8	/* prepare for stdu  */
	cmpwi	cr0,r9,128	/* 128-byte lines take the big_lines path  */
	addi	r4,r4,-8	/* prepare for ldu  */


	ble+	cr5,L(lessthancacheline)

	beq-	cr0,L(big_lines) /* 128 byte line code  */




/* More than a cacheline left to go, and using 64 byte cachelines  */

	clrlwi	r7,r7,32-6	/* How far to next cacheline bdy?  */

	cmplwi	cr6,r7,0	/* Are we on a cacheline bdy already?  */

	/* Reduce total len by what it takes to get to the next cache line  */
	subf	r5,r7,r5
	srwi	r7,r7,4		/* How many qws to get to the line bdy?  */

	/* How many full cache lines to copy after getting to a line bdy?  */
	srwi	r10,r5,6

	cmplwi	r10,0		/* If no full cache lines to copy ...  */
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	L(nocacheprefetch)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do.  */

	cmplwi	r10,PREFETCH_AHEAD
	li	r12,64+8	/* prefetch distance  */
	ble	L(lessthanmaxprefetch)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed.  */

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

L(lessthanmaxprefetch):
	mtctr	r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder.  */

L(prefetchSRC):
	dcbt	r12,r4
	addi	r12,r12,64
	bdnz	L(prefetchSRC)


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7  - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch):
	mtctr	r7

	cmplwi	cr1,r5,64	/* Less than a cache line to copy?  */

	/* How many bytes are left after we copy whatever full
	   cache lines we can get?  */
	clrlwi	r5,r5,32-6

	beq	cr6,L(cachelinealigned)


/* Copy quadwords up to the next cacheline boundary  */

L(aligntocacheline):
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	L(aligntocacheline)


	.align 4
L(cachelinealigned):		/* copy while cache lines  */

	blt-	cr1,L(lessthancacheline) /* size <64  */

L(outerloop):
	cmpwi	r11,0
	mtctr	r11
	beq-	L(endloop)

	li	r11,64*ZERO_AHEAD +8	/* DCBZ dist  */

	.align	4
/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop):			/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	lfd	fp9, 0x08(r4)
	dcbz	r11,r6		/* Allocate DST line without reading memory.  */
	lfd	fp10, 0x10(r4)
	lfd	fp11, 0x18(r4)
	lfd	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfd	fp12, 0x20(r6)
	lfd	fp9, 0x28(r4)
	lfd	fp10, 0x30(r4)
	lfd	fp11, 0x38(r4)
	lfdu	fp12, 0x40(r4)
	stfd	fp9, 0x28(r6)
	stfd	fp10, 0x30(r6)
	stfd	fp11, 0x38(r6)
	stfdu	fp12, 0x40(r6)

	bdnz	L(loop)


L(endloop):
	cmpwi	r10,0		/* r10 = lines in the initial prefetched batch;
				   copy them now without further prefetch.  */
	beq-	L(endloop2)
	mtctr	r10

L(loop2):			/* Copy aligned body  */
	lfd	fp9, 0x08(r4)
	lfd	fp10, 0x10(r4)
	lfd	fp11, 0x18(r4)
	lfd	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfd	fp12, 0x20(r6)
	lfd	fp9, 0x28(r4)
	lfd	fp10, 0x30(r4)
	lfd	fp11, 0x38(r4)
	lfdu	fp12, 0x40(r4)
	stfd	fp9, 0x28(r6)
	stfd	fp10, 0x30(r6)
	stfd	fp11, 0x38(r6)
	stfdu	fp12, 0x40(r6)

	bdnz	L(loop2)
L(endloop2):
	/* Fall through: at most a partial cache line (r5 < 64) remains.  */


	.align 4
L(lessthancacheline):		/* Was there less than cache to do ?  */
	cmplwi	cr0,r5,16
	srwi	r7,r5,4		/* divide size by 16  */
	blt-	L(do_lt16)
	mtctr	r7

L(copy_remaining):
	lfd	fp9, 0x08(r4)
	lfdu	fp10, 0x10(r4)
	stfd	fp9, 0x08(r6)
	stfdu	fp10, 0x10(r6)
	bdnz	L(copy_remaining)

L(do_lt16):			/* less than 16 ?  */
	cmplwi	cr0,r5,0	/* copy remaining bytes (0-15)  */
	beqlr+			/* no rest to copy  */
	addi	r4,r4,8		/* undo the -8 bias applied for ldu/stdu  */
	addi	r6,r6,8

L(shortcopy):			/* SIMPLE COPY to handle size =< 15 bytes  */
	mtcrf	0x01,r5		/* cr7 bits 0..3 = r5 bits 8,4,2,1  */
	sub	r7,r4,r6	/* offset to src from dest  */
	bf-	cr7*4+0,8f
	lfdx	fp9,r7,r6	/* copy 8 byte  */
	stfd	fp9,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
1:
	blr





/* Similar to above, but for use with 128 byte lines.  */


L(big_lines):

	clrlwi	r7,r7,32-7	/* How far to next cacheline bdy?  */

	cmplwi	cr6,r7,0	/* Are we on a cacheline bdy already?  */

	/* Reduce total len by what it takes to get to the next cache line  */
	subf	r5,r7,r5
	srwi	r7,r7,4		/* How many qw to get to the line bdy?  */

	/* How many full cache lines to copy after getting to a line bdy?  */
	srwi	r10,r5,7

	cmplwi	r10,0		/* If no full cache lines to copy ...  */
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	L(nocacheprefetch_128)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do.  */

	cmplwi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance  */
	ble	L(lessthanmaxprefetch_128)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed.  */

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
	mtctr	r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder.  */

L(prefetchSRC_128):
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	L(prefetchSRC_128)


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7  - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch_128):
	mtctr	r7

	cmplwi	cr1,r5,128	/* Less than a cache line to copy?  */

	/* How many bytes are left after we copy whatever full
	   cache lines we can get?  */
	clrlwi	r5,r5,32-7

	beq	cr6,L(cachelinealigned_128)


/* Copy quadwords up to the next cacheline boundary  */

L(aligntocacheline_128):
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	L(aligntocacheline_128)


L(cachelinealigned_128):	/* copy while cache lines  */

	blt-	cr1,L(lessthancacheline) /* size <128  */

L(outerloop_128):
	cmpwi	r11,0
	mtctr	r11
	beq-	L(endloop_128)

	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist  */

	.align	4
/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop_128):			/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	lfd	fp9, 0x08(r4)
	dcbz	r11,r6		/* Allocate DST line without reading memory.  */
	lfd	fp10, 0x10(r4)
	lfd	fp11, 0x18(r4)
	lfd	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfd	fp12, 0x20(r6)
	lfd	fp9, 0x28(r4)
	lfd	fp10, 0x30(r4)
	lfd	fp11, 0x38(r4)
	lfd	fp12, 0x40(r4)
	stfd	fp9, 0x28(r6)
	stfd	fp10, 0x30(r6)
	stfd	fp11, 0x38(r6)
	stfd	fp12, 0x40(r6)
	lfd	fp9, 0x48(r4)
	lfd	fp10, 0x50(r4)
	lfd	fp11, 0x58(r4)
	lfd	fp12, 0x60(r4)
	stfd	fp9, 0x48(r6)
	stfd	fp10, 0x50(r6)
	stfd	fp11, 0x58(r6)
	stfd	fp12, 0x60(r6)
	lfd	fp9, 0x68(r4)
	lfd	fp10, 0x70(r4)
	lfd	fp11, 0x78(r4)
	lfdu	fp12, 0x80(r4)
	stfd	fp9, 0x68(r6)
	stfd	fp10, 0x70(r6)
	stfd	fp11, 0x78(r6)
	stfdu	fp12, 0x80(r6)

	bdnz	L(loop_128)


L(endloop_128):
	cmpwi	r10,0		/* r10 = lines in the initial prefetched batch;
				   copy them now without further prefetch.  */
	beq-	L(endloop2_128)
	mtctr	r10

L(loop2_128):			/* Copy aligned body  */
	lfd	fp9, 0x08(r4)
	lfd	fp10, 0x10(r4)
	lfd	fp11, 0x18(r4)
	lfd	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfd	fp12, 0x20(r6)
	lfd	fp9, 0x28(r4)
	lfd	fp10, 0x30(r4)
	lfd	fp11, 0x38(r4)
	lfd	fp12, 0x40(r4)
	stfd	fp9, 0x28(r6)
	stfd	fp10, 0x30(r6)
	stfd	fp11, 0x38(r6)
	stfd	fp12, 0x40(r6)
	lfd	fp9, 0x48(r4)
	lfd	fp10, 0x50(r4)
	lfd	fp11, 0x58(r4)
	lfd	fp12, 0x60(r4)
	stfd	fp9, 0x48(r6)
	stfd	fp10, 0x50(r6)
	stfd	fp11, 0x58(r6)
	stfd	fp12, 0x60(r6)
	lfd	fp9, 0x68(r4)
	lfd	fp10, 0x70(r4)
	lfd	fp11, 0x78(r4)
	lfdu	fp12, 0x80(r4)
	stfd	fp9, 0x68(r6)
	stfd	fp10, 0x70(r6)
	stfd	fp11, 0x78(r6)
	stfdu	fp12, 0x80(r6)
	bdnz	L(loop2_128)
L(endloop2_128):

	/* Finish the remaining partial line in the common tail code.  */
	b	L(lessthancacheline)


END (memcpy)
libc_hidden_builtin_def (memcpy)
526 | |