/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
   of handling unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.  */
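
/* For illustration only (this comment is never assembled): the dispatch
   below corresponds roughly to the following hedged C sketch, where
   short_copy, copy_head, aligned_dw_copy and unaligned_dw_copy are
   hypothetical names for the .L2, prologue, .L0 and .L6 code paths.

     void *memcpy_sketch (void *dst, const void *src, size_t len)
     {
       if (len < 32)
         return short_copy (dst, src, len);    // .L2: branch-free moves
       size_t head = (-(uintptr_t) dst) & 7;   // bytes to DW-align dst
       copy_head (dst, src, head);             // 0-7 byte prologue;
                                               // both pointers advance
       if (((uintptr_t) src + head) & 7)
         unaligned_dw_copy (dst, src, len);    // .L6: load/shift/store
       else
         aligned_dw_copy (dst, src, len);      // unrolled, 32B/iteration
       return dst;
     }  */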

#ifndef MEMCPY
# define MEMCPY memcpy
#endif
        .machine power4
ENTRY_TOCLESS (MEMCPY, 5)
        CALL_MCOUNT 3

        cmpldi  cr1,5,31
        neg     0,3
        std     3,-16(1)
        std     31,-8(1)
        cfi_offset(31,-8)
        andi.   11,3,7          /* check alignment of dst.  */
        clrldi  0,0,61          /* Number of bytes until the 1st doubleword of dst.  */
        clrldi  10,4,61         /* check alignment of src.  */
        cmpldi  cr6,5,8
        ble-    cr1,.L2         /* If move < 32 bytes use short move code.  */
        cmpld   cr6,10,11
        mr      12,4
        srdi    9,5,3           /* Number of full double words remaining.  */
        mtcrf   0x01,0
        mr      31,5
        beq     .L0

        subf    31,0,5
/* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
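/* A hedged aside (comment only): mtcrf 0x01,0 above moved the low bits
   of that 0-7 byte count into cr7, so each bf (branch-if-false) below
   skips one power-of-two move.  In rough C terms:

     if (count & 1) copy 1 byte;       // tested by bf 31
     if (count & 2) copy 2 bytes;      // tested by bf 30
     if (count & 4) copy 4 bytes;      // tested by bf 29
*/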
1:      bf      31,2f
        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      bf      30,4f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      bf      29,0f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
0:
        clrldi  10,12,61        /* check alignment of src again.  */
        srdi    9,31,3          /* Number of full double words remaining.  */

/* Copy doublewords from source to destination, assuming the
   destination is aligned on a doubleword boundary.

   At this point we know there are at least 25 bytes left (32-7) to copy.
   The next step is to determine if the source is also doubleword aligned.
   If not, branch to the unaligned move code at .L6, which uses
   a load, shift, store strategy.

   Otherwise source and destination are doubleword aligned, and we can
   use the optimized doubleword copy loop.  */
.L0:
        clrldi  11,31,61
        mtcrf   0x01,9
        cmpldi  cr1,11,0
        bne-    cr6,.L6         /* If source is not DW aligned.  */

/* Move doublewords where destination and source are DW aligned.
   Use an unrolled loop to copy 4 doublewords (32 bytes) per iteration.
   If the copy is not an exact multiple of 32 bytes, 1-3
   doublewords are copied as needed to set up the main loop.  After
   the main loop exits there may be a tail of 1-7 bytes.  These bytes are
   copied a word/halfword/byte at a time as needed to preserve alignment.  */
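
/* Hedged C sketch of this aligned path (illustration only; d and s
   stand for the doubleword-aligned running pointers):

     size_t blocks = len >> 5;          // srdi 8,31,5, loaded into CTR
     // peel 1-3 leading doublewords (low bits of the DW count) so the
     // main loop always moves exactly 4 doublewords per iteration
     while (blocks--)                   // bdnz
       {
         uint64_t a = s[0], b = s[1], c = s[2], e = s[3];
         d[0] = a; d[1] = b; d[2] = c; d[3] = e;
         s += 4; d += 4;
       }
     // finally a 0-7 byte tail, moved word/halfword/byte at a time
*/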

        srdi    8,31,5
        cmpldi  cr1,9,4
        cmpldi  cr6,11,0
        mr      11,12

        bf      30,1f
        ld      6,0(12)
        ld      7,8(12)
        addi    11,12,16
        mtctr   8
        std     6,0(3)
        std     7,8(3)
        addi    10,3,16
        bf      31,4f
        ld      0,16(12)
        std     0,16(3)
        blt     cr1,3f
        addi    11,12,24
        addi    10,3,24
        b       4f
        .align  4
1:
        mr      10,3
        mtctr   8
        bf      31,4f
        ld      6,0(12)
        addi    11,12,8
        std     6,0(3)
        addi    10,3,8

        .align  4
4:
        ld      6,0(11)
        ld      7,8(11)
        ld      8,16(11)
        ld      0,24(11)
        addi    11,11,32
2:
        std     6,0(10)
        std     7,8(10)
        std     8,16(10)
        std     0,24(10)
        addi    10,10,32
        bdnz    4b
3:

        rldicr  0,31,0,60
        mtcrf   0x01,31
        beq     cr6,0f
.L9:
        add     3,3,0
        add     12,12,0

/* At this point we have a tail of 0-7 bytes and we know that the
   destination is doubleword aligned.  */
4:      bf      29,2f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      bf      30,1f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      bf      31,0f
        lbz     6,0(12)
        stb     6,0(3)
0:
        /* Return original dst pointer.  */
        ld      31,-8(1)
        ld      3,-16(1)
        blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary (1,2,4,8)
   tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries, it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are then only word aligned, we
   use word (not doubleword) load/stores to ensure all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096-byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small delay.  */
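
/* A hedged C outline of the 9-31 byte path (the helper names here are
   hypothetical, for illustration only):

     size_t fix = (-(uintptr_t) src) & 3;   // 1-3 bytes to word-align src
     copy_1_to_3 (dst, src, fix);           // endian-aware, see below
     dst += fix; src += fix; len -= fix;
     if (len >= 16) move_16 ();             // then a binary descent
     if (len & 8)   move_8 ();              // on the bits of len:
     if (len & 4)   move_4 ();
     if (len & 2)   move_2 ();
     if (len & 1)   move_1 ();
*/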

        .align  4
.L2:
        mtcrf   0x01,5
        neg     8,4
        clrrdi  11,4,2
        andi.   0,8,3
        ble     cr6,.LE8        /* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
        cmpldi  cr1,5,16
        mr      10,5
        mr      12,4
        cmpldi  cr6,0,2
        beq     .L3     /* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
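/* A hedged note: the lwz below fetches the aligned word that contains
   the first source bytes (address rounded down by clrrdi 11,4,2), cr6
   compares the 1-3 byte count against 2, and the branches store 1, 2,
   or 3 bytes out of that word; the #ifdef __LITTLE_ENDIAN__ shifts
   correct the byte order on little-endian.  */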
        lwz     6,0(11)
        subf    10,0,5
        add     12,4,0
        blt     cr6,5f
        srdi    7,6,16
        bgt     cr6,3f
#ifdef __LITTLE_ENDIAN__
        sth     7,0(3)
#else
        sth     6,0(3)
#endif
        b       7f
        .align  4
3:
#ifdef __LITTLE_ENDIAN__
        rotlwi  6,6,24
        stb     6,0(3)
        sth     7,1(3)
#else
        stb     7,0(3)
        sth     6,1(3)
#endif
        b       7f
        .align  4
5:
#ifdef __LITTLE_ENDIAN__
        rotlwi  6,6,8
#endif
        stb     6,0(3)
7:
        cmpldi  cr1,10,16
        add     3,3,0
        mtcrf   0x01,10
        .align  4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
        blt     cr1,8f
16:     /* Move 16 bytes.  */
        lwz     6,0(12)
        lwz     7,4(12)
        stw     6,0(3)
        lwz     6,8(12)
        stw     7,4(3)
        lwz     7,12(12)
        addi    12,12,16
        stw     6,8(3)
        stw     7,12(3)
        addi    3,3,16
8:      /* Move 8 bytes.  */
        bf      28,4f
        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Move 4 bytes.  */
        bf      29,2f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Move 2-3 bytes.  */
        bf      30,1f
        lhz     6,0(12)
        sth     6,0(3)
        bf      31,0f
        lbz     7,2(12)
        stb     7,2(3)
        ld      3,-16(1)
        blr
1:      /* Move 1 byte.  */
        bf      31,0f
        lbz     6,0(12)
        stb     6,0(3)
0:
        /* Return original dst pointer.  */
        ld      3,-16(1)
        blr

/* Special case to copy 0-8 bytes.  */
        .align  4
.LE8:
        mr      12,4
        bne     cr6,4f
/* We would have liked to use ld/std here, but the 630 processors are
   slow for load/store doubles that are not at least word aligned.
   Unaligned load/store words execute with only a 1-cycle penalty.  */
        lwz     6,0(4)
        lwz     7,4(4)
        stw     6,0(3)
        stw     7,4(3)
        /* Return original dst pointer.  */
        ld      3,-16(1)
        blr
        .align  4
4:      bf      29,2b
        lwz     6,0(4)
        stw     6,0(3)
6:
        bf      30,5f
        lhz     7,4(4)
        sth     7,4(3)
        bf      31,0f
        lbz     8,6(4)
        stb     8,6(3)
        ld      3,-16(1)
        blr
        .align  4
5:
        bf      31,0f
        lbz     6,4(4)
        stb     6,4(3)
        .align  4
0:
        /* Return original dst pointer.  */
        ld      3,-16(1)
        blr

        .align  4
.L6:

/* Copy doublewords where the destination is aligned but the source is
   not.  Use aligned doubleword loads from the source, shifted to realign
   the data, to allow aligned destination stores.  */
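/* The realignment merges two consecutive aligned source doublewords
   into each aligned store.  A hedged C sketch (big-endian shifts shown;
   the #ifdef __LITTLE_ENDIAN__ blocks below swap the shift directions):

     unsigned lshift = (src % 8) * 8;    // sldi 10,10,3
     unsigned rshift = 64 - lshift;      // subfic 9,10,64
     uint64_t merged = (w0 << lshift) | (w1 >> rshift);
     *aligned_dst++ = merged;            // std to the aligned destination
*/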
        addi    11,9,-1         /* loop DW count is one less than total */
        subf    5,10,12
        sldi    10,10,3
        mr      4,3
        srdi    8,11,2          /* calculate the 32 byte loop count */
        ld      6,0(5)
        mtcrf   0x01,11
        cmpldi  cr6,9,4
        mtctr   8
        ld      7,8(5)
        subfic  9,10,64
        bf      30,1f

        /* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
        srd     0,6,10
        sld     8,7,9
#else
        sld     0,6,10
        srd     8,7,9
#endif
        or      0,0,8
        ld      6,16(5)
        std     0,0(4)
#ifdef __LITTLE_ENDIAN__
        srd     0,7,10
        sld     8,6,9
#else
        sld     0,7,10
        srd     8,6,9
#endif
        or      0,0,8
        ld      7,24(5)
        std     0,8(4)
        addi    4,4,16
        addi    5,5,32
        blt     cr6,8f          /* if total DWs = 3, then bypass loop */
        bf      31,4f
        /* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
        srd     0,6,10
        sld     8,7,9
#else
        sld     0,6,10
        srd     8,7,9
#endif
        or      0,0,8
        std     0,0(4)
        mr      6,7
        ld      7,0(5)
        addi    5,5,8
        addi    4,4,8
        beq     cr6,8f          /* if total DWs = 4, then bypass loop */
        b       4f
        .align  4
1:
#ifdef __LITTLE_ENDIAN__
        srd     0,6,10
        sld     8,7,9
#else
        sld     0,6,10
        srd     8,7,9
#endif
        addi    5,5,16
        or      0,0,8
        bf      31,4f
        mr      6,7
        ld      7,0(5)
        addi    5,5,8
        std     0,0(4)
        addi    4,4,8
        .align  4
/* copy 32 bytes at a time */
4:
#ifdef __LITTLE_ENDIAN__
        srd     0,6,10
        sld     8,7,9
#else
        sld     0,6,10
        srd     8,7,9
#endif
        or      0,0,8
        ld      6,0(5)
        std     0,0(4)
#ifdef __LITTLE_ENDIAN__
        srd     0,7,10
        sld     8,6,9
#else
        sld     0,7,10
        srd     8,6,9
#endif
        or      0,0,8
        ld      7,8(5)
        std     0,8(4)
#ifdef __LITTLE_ENDIAN__
        srd     0,6,10
        sld     8,7,9
#else
        sld     0,6,10
        srd     8,7,9
#endif
        or      0,0,8
        ld      6,16(5)
        std     0,16(4)
#ifdef __LITTLE_ENDIAN__
        srd     0,7,10
        sld     8,6,9
#else
        sld     0,7,10
        srd     8,6,9
#endif
        or      0,0,8
        ld      7,24(5)
        std     0,24(4)
        addi    5,5,32
        addi    4,4,32
        bdnz+   4b
        .align  4
8:
        /* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
        srd     0,6,10
        sld     8,7,9
#else
        sld     0,6,10
        srd     8,7,9
#endif
        or      0,0,8
        std     0,0(4)
3:
        rldicr  0,31,0,60
        mtcrf   0x01,31
        bne     cr1,.L9 /* A nonzero tail is copied at .L9; otherwise we are done.  */
        /* Return original dst pointer.  */
        ld      31,-8(1)
        ld      3,-16(1)
        blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)