/* Optimized memcpy implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] memcpy (void *dst [r3], const void *src [r4], size_t len [r5]);
   Returns 'dst'.  */

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define dst 11		/* Use r11 so that r3 stays unchanged for the return value.  */
#define src 4
#define cnt 5

	.machine power7
ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,cnt,31
	neg	0,3		/* r0 = -dst; its low 4 bits give the distance
				   to the next quadword boundary.  */
	ble	cr1, L(copy_LT_32)  /* If the move is < 32 bytes use the short
				       move code.  */

/* Align copies using VSX instructions to quadword boundaries.  This avoids
   alignment traps when memcpy is used on non-cacheable memory (for instance,
   memory-mapped I/O).  */
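
/* Three paths follow: short moves (< 32 bytes) were already dispatched
   above; when SRC and DST share the same quadword misalignment they are
   aligned together and copied with the 128-byte loop; otherwise the
   vperm-based realignment path at L(copy_GE_32_unaligned) is used.  */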
	andi.	10,3,15		/* DST alignment (also sets cr0 for beq below).  */
	clrldi	11,4,60		/* SRC alignment.  */
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	dst,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
	clrldi	0,0,60

/* Get the DST and SRC aligned to 16 bytes.  r0 holds the number of bytes
   to the next quadword boundary; mtocrf copied its low 4 bits into CR7,
   so bits 31/30/29/28 select the 1-, 2-, 4- and 8-byte moves below.  */
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,16f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
16:
	subf	cnt,0,cnt	/* Deduct the bytes copied to align.  */

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
	mtocrf	0x02,cnt	/* cnt bits for 64/32/16 -> CR6 (bf 25/26/27).  */
	srdi	12,cnt,7	/* Number of 128-byte blocks.  */
	cmpdi	12,0
	beq	L(aligned_tail)
	lvx	6,0,src
	lvx	7,src,6
	mtctr	12
	b	L(aligned_128loop)
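
/* The first pair of quadwords is loaded before entering the loop;
   L(aligned_128head) repeats those loads for the second and later
   iterations, so loads overlap the previous iteration's stores.  */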

	.align	4
L(aligned_128head):
	/* Loads for the second and subsequent iterations of this loop.  */
	lvx	6,0,src
	lvx	7,src,6
L(aligned_128loop):
	lvx	8,src,7
	lvx	9,src,8
	stvx	6,0,dst
	addi	src,src,64
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	lvx	6,0,src
	lvx	7,src,6
	addi	dst,dst,64
	lvx	8,src,7
	lvx	9,src,8
	addi	src,src,64
	stvx	6,0,dst
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	addi	dst,dst,64
	bdnz	L(aligned_128head)

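/* Copy the remaining 0~127 bytes.  CR6 (set in L(aligned_copy)) selects
   the 64-, 32- and 16-byte chunks via bf 25/26/27; CR7 (set below) selects
   the final 8, 4, 2 and 1 bytes via bf 28/29/30/31.  */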
L(aligned_tail):
	mtocrf	0x01,cnt	/* cnt low 4 bits -> CR7.  */
	bf	25,32f
	lvx	6,0,src
	lvx	7,src,6
	lvx	8,src,7
	lvx	9,src,8
	addi	src,src,64
	stvx	6,0,dst
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	addi	dst,dst,64
32:
	bf	26,16f
	lvx	6,0,src
	lvx	7,src,6
	addi	src,src,32
	stvx	6,0,dst
	stvx	7,dst,6
	addi	dst,dst,32
16:
	bf	27,8f
	lvx	6,0,src
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
8:
	bf	28,4f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr


/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	mr	dst,3
	cmpldi	cr6,cnt,8
	mtocrf	0x01,cnt	/* cnt low 4 bits -> CR7 for the tail code.  */
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3		/* Bytes needed to word-align SRC.  */
	cmpldi	cr1,cnt,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	cnt,0,cnt
2:
	bf	30,1f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,cnt,16
	mtocrf	0x01,cnt

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	lwz	8,8(src)
	stw	7,4(dst)
	lwz	6,12(src)
	addi	src,src,16
	stw	8,8(dst)
	stw	6,12(dst)
	addi	dst,dst,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8

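/* The tails below test CR7 bits of cnt: bit 29 = 4 bytes, bit 30 = 2 bytes,
   bit 31 = 1 byte; bflr 31 returns directly when no odd byte remains.  */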
	.align	4
	/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(src)
	sth	6,0(dst)
	bflr	31
	lbz	7,2(src)
	stb	7,2(dst)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(src)
	stb	6,4(dst)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(src)
	stb	6,0(dst)
	/* Return original DST pointer.  */
	blr


/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,L(tail4)	/* cnt != 8: CR7 bits select the 0~7-byte tail.  */

	/* Exactly 8 bytes: though we could have used ld/std here, they are
	   still slow for the unaligned case.  */

	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	stw	7,4(dst)
	blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
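
/* Illustrative sketch of the technique, in C-like pseudocode (not part of
   the build; 'align16' and 'merge' are hypothetical helpers, 'merge'
   standing for the vperm whose control vector lvsl/lvsr derive from SRC's
   low 4 bits):

     prev = *(vector *) align16 (src);
     while (quadwords--)
       {
	 next = *(vector *) (align16 (src) + 16);
	 *(vector *) dst = merge (prev, next, src & 15);
	 prev = next;  src += 16;  dst += 16;
       }
 */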
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)	/* cr0: DST already aligned.  */

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	cnt,0,cnt

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
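	/* Note that only DST gets aligned here; SRC keeps a nonzero offset
	   (the alignments differ on this path), which the vperm realignment
	   below absorbs.  */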
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,0f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
0:
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indices to speed up the indexed vector operations.  */
	clrldi	10,cnt,60	/* Tail bytes (cnt % 16).  */
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,10,0
	srdi	8,cnt,5	      /* Set up the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
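	/* lvsl/lvsr build a permute-control vector from SRC's low 4 bits;
	   vperm applied to two adjacent aligned loads then produces 16
	   contiguous bytes starting at SRC.  Little-endian needs lvsr and
	   swapped vperm operands.  */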
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,src
#else
	lvsl	5,0,src
#endif
	lvx	3,0,src
	li	0,0
	bf	31,L(setup_unaligned_loop)

	/* The quadword count is odd: copy 16 bytes now so the loop below
	   works on 32-byte multiples.  */
	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
	vor	3,4,4
	clrrdi	0,src,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)	/* At most 1 quadword: skip the loop.  */

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied, but in
	   order to get proper alignment, we may have to copy some portions
	   again.  This is still faster than using unaligned vector loads.  */

	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,src,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	src,src,32
	stvx	6,0,dst
	stvx	10,dst,6
	addi	dst,dst,32
	bdnz	L(unaligned_loop)

	clrrdi	0,src,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,cnt
	beqlr	cr1

	add	src,src,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)