/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */
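
/* For reference, the operation implemented below behaves like this C
   sketch (not the actual generic implementation):

     void *
     __mempcpy (void *dst, const void *src, size_t len)
     {
       memcpy (dst, src, len);
       return (char *) dst + len;
     }  */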

#ifndef MEMPCPY
# define MEMPCPY __mempcpy
#endif
	.machine power7
ENTRY_TOCLESS (MEMPCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	cfi_offset(31,-8)
	ble	cr1,L(copy_LT_32)	/* If move < 32 bytes use short move
					   code.  */

	andi.	11,3,7		/* Check alignment of DST.  */


	clrldi	10,4,61		/* Check alignment of SRC.  */
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srdi	9,5,3		/* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrldi	0,0,61
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC (and DST, which has the same alignment) aligned to
	   8 bytes.  */
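	/* CR7 holds the low bits of r0 = -DST & 7: bit 31 selects a 1-byte
	   copy, bit 30 a 2-byte copy and bit 29 a 4-byte copy, together
	   covering the 0~7 bytes needed to reach 8-byte alignment.  */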

1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61	/* Check alignment of SRC again.  */
	srdi	9,31,3		/* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrldi	11,31,61
	mtcrf	0x01,9

	srdi	8,31,5
	cmpldi	cr1,9,4
	cmpldi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	   at a multiple of 32 bytes.  */
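	/* CR7 holds the low bits of the doubleword count in r9: bit 30 set
	   means two leading doublewords are copied, bit 31 set means one
	   more, leaving a multiple of four doublewords (32 bytes) for the
	   main loop.  */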

	bf	30,1f
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	bf	31,4f
	ld	0,16(12)
	std	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8

	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
	.align	4
4:
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32

	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */
	rldicr	0,31,0,60
	mtcrf	0x01,31
	beq	cr6,0f

.L9:
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	   destination is doubleword-aligned.  */
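	/* CR7 holds the low bits of the remaining length in r31: bit 29
	   selects a 4-byte copy, bit 30 a 2-byte copy and bit 31 a 1-byte
	   copy, together covering the 0~7 tail bytes.  */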
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmpldi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrdi	11,4,2
	andi.	0,8,3
	cmpldi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f

	/* Though we could have used ld/std here, they are still slow for
	   the unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	ld	3,-16(1)	/* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
4:	/* Copy 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
	   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
	   the data, allowing for aligned DST stores.  */
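
/* Roughly, the big-endian case below behaves like the following sketch
   using the AltiVec intrinsics (illustrative only; 'blocks', 'src' and
   'dst' are made-up names, not the registers used by this file):

     vector unsigned char perm, prev, next;
     perm = vec_lvsl (0, src);	   -- permute control from SRC alignment
     prev = vec_ld (0, src);	   -- first aligned quadword covering SRC
     while (blocks--)
       {
	 next = vec_ld (16, src);
	 vec_st (vec_perm (prev, next, perm), 0, dst);
	 prev = next;  src += 16;  dst += 16;
       }  */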
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60		/* Number of bytes until the 1st
				   quadword.  */
	andi.	11,3,15		/* Check alignment of DST (against
				   quadwords).  */
	srdi	9,5,4		/* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	ld	6,0(12)
	addi	12,12,8
	std	6,0(3)
	addi	3,3,8
0:
	clrldi	10,12,60	/* Check alignment of SRC.  */
	srdi	9,31,4		/* Number of full quadwords remaining.  */

	/* DST is now quadword aligned; it is OK to copy the bytes.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	11,31,60
	li	6,16		/* Index for 16-byte offsets.  */
	li	7,32		/* Index for 32-byte offsets.  */
	cmpldi	cr1,11,0
	srdi	8,31,5		/* Set up the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,12
#else
	lvsl	5,0,12
#endif
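	/* On little-endian the shift vector comes from lvsr and the vperm
	   operands below are swapped, which produces the same realignment
	   that the lvsl/vperm pairing gives on big-endian.  */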
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes so the main loop copies 32 bytes per
	   iteration.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied, but in
	   order to get proper alignment we may have to copy some portions
	   again.  This is still faster than using unaligned vector
	   accesses.  */

	lvx	4,11,6		/* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,11,7		/* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	rldicr	0,31,0,59
	mtcrf	0x01,31
	beq	cr1,0f

	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

END_GEN_TB (MEMPCPY,TB_TOCLESS)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)