/* Optimized memcpy implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.  */

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define dst 11		/* Use r11 so that r3 (the return value) is left unchanged.  */
#define src 4
#define cnt 5
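
/* Rough outline of the code below, as illustrative pseudocode only (the
   names refer to the labels that follow):

     if (len < 32)                       copy_LT_32: GPR loads/stores only
     else if ((dst & 15) == (src & 15))  aligned_copy: align both pointers,
                                         then move 128 bytes per iteration
                                         with lvx/stvx
     else                                copy_GE_32_unaligned: align DST,
                                         realign SRC data with lvsl/vperm  */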

	.machine power7
ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,cnt,31
	neg	0,3
	ble	cr1, L(copy_LT_32)	/* If the move is < 32 bytes use the
					   short-move code.  */

/* Quadword-align copies done with vector instructions.  This avoids
   alignment traps when memcpy is used on non-cacheable memory (for
   instance, memory-mapped I/O).  */
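/* r10 = DST modulo 16 and r11 = SRC modulo 16; cr6 records whether the two
   residues match.  r0 (set to -DST by the 'neg' above) already holds, in its
   low four bits, the byte count needed to reach the next 16-byte boundary.  */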
	andi.	10,3,15
	clrldi	11,4,60
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	dst,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
	clrldi	0,0,60

/* Get the DST and SRC aligned to 16 bytes.  */
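/* CR7 was loaded from the low bits of r0 (= -DST), so bits 31, 30, 29 and 28
   select a 1-, 2-, 4- and 8-byte move respectively; together they transfer
   the 0~15 bytes needed to reach a 16-byte boundary, and r0 is subtracted
   from cnt afterwards.  */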
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,16f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
16:
	subf	cnt,0,cnt

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
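/* r12 = cnt / 128 gives the iteration count, and mtocrf 0x02 latches the
   16/32/64/128 bits of cnt into CR6 for use by L(aligned_tail).  The first
   two quadword loads of each 128-byte block are issued early: before the
   branch into L(aligned_128loop) for the first iteration, and at
   L(aligned_128head) for the following ones.  */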
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
	mtocrf	0x02,cnt
	srdi	12,cnt,7
	cmpdi	12,0
	beq	L(aligned_tail)
	lvx	6,0,src
	lvx	7,src,6
	mtctr	12
	b	L(aligned_128loop)

	.align	4
L(aligned_128head):
	/* Reload for the second and later iterations of this loop.  */
	lvx	6,0,src
	lvx	7,src,6
L(aligned_128loop):
	lvx	8,src,7
	lvx	9,src,8
	stvx	6,0,dst
	addi	src,src,64
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	lvx	6,0,src
	lvx	7,src,6
	addi	dst,dst,64
	lvx	8,src,7
	lvx	9,src,8
	addi	src,src,64
	stvx	6,0,dst
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	addi	dst,dst,64
	bdnz	L(aligned_128head)

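/* Copy the remaining 0~127 bytes.  CR6 (loaded from cnt before the loop)
   provides the 64/32/16 tests and CR7 (loaded from cnt here) the 8/4/2/1
   tests, so each 'bf' below skips the block whose size bit is clear.  */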
L(aligned_tail):
	mtocrf	0x01,cnt
	bf	25,32f
	lvx	6,0,src
	lvx	7,src,6
	lvx	8,src,7
	lvx	9,src,8
	addi	src,src,64
	stvx	6,0,dst
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	addi	dst,dst,64
32:
	bf	26,16f
	lvx	6,0,src
	lvx	7,src,6
	addi	src,src,32
	stvx	6,0,dst
	stvx	7,dst,6
	addi	dst,dst,32
16:
	bf	27,8f
	lvx	6,0,src
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
8:
	bf	28,4f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr


/* Handle copies of 0~31 bytes.  */
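/* Short copies use GPRs only: cr6 (cnt <= 8) selects L(copy_LE_8); otherwise
   SRC is first brought to 4-byte alignment and the copy proceeds in 4-byte
   words plus a small tail.  */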
	.align	4
L(copy_LT_32):
	mr	dst,3
	cmpldi	cr6,cnt,8
	mtocrf	0x01,cnt
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3
	cmpldi	cr1,cnt,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	cnt,0,cnt
2:
	bf	30,1f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,cnt,16
	mtocrf	0x01,cnt

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	lwz	8,8(src)
	stw	7,4(dst)
	lwz	6,12(src)
	addi	src,src,16
	stw	8,8(dst)
	stw	6,12(dst)
	addi	dst,dst,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(src)
	sth	6,0(dst)
	bflr	31
	lbz	7,2(src)
	stb	7,2(dst)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(src)
	stb	6,4(dst)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(src)
	stb	6,0(dst)
	/* Return original DST pointer.  */
	blr


/* Handle copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,L(tail4)

	/* Although ld/std could have been used here, they are still slow
	   for unaligned cases.  */

	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	stw	7,4(dst)
	blr


/* Handle copies of 32+ bytes where SRC and DST alignments differ.  DST is
   first brought to quadword alignment; aligned quadword loads from SRC are
   then shifted to realign the data, allowing aligned DST stores.  */
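/* The underlying technique: lvx ignores the low four bits of the address, so
   the code loads the two aligned quadwords surrounding each misaligned
   16-byte chunk of SRC and merges them with vperm.  The permute control
   vector comes from lvsl (big endian) or lvsr (little endian) applied to the
   original SRC.  Roughly, per 16-byte chunk (illustrative pseudocode only):

       vA     = aligned quadword containing src
       vB     = next aligned quadword
       result = vperm (vA, vB, lvsl (0, src))   -- BE operand order;
                                                   LE swaps vA and vB  */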
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	cnt,0,cnt

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,0f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
0:
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is now present; it is OK to copy the bytes.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indices to speed up the indexed vector operations.  */
	clrldi	10,cnt,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,10,0
	srdi	8,cnt,5	      /* Set up the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,src
#else
	lvsl	5,0,src
#endif
	lvx	3,0,src
	li	0,0
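	/* At this point r9 holds the number of full quadwords, r8 the number
	   of 32-byte loop iterations, cr1 whether 1~15 tail bytes remain,
	   and cr6 whether the 32-byte loop runs at all.  vr5 holds the
	   permute control derived from SRC's misalignment and vr3 the first
	   aligned quadword.  The 'bf 31' below tests the low bit of r9,
	   i.e. whether the quadword count is odd.  */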
	bf	31,L(setup_unaligned_loop)

	/* The quadword count is odd: copy one 16-byte block first so the
	   main loop below can move 32 bytes per iteration.  */
	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
	vor	3,4,4
	clrrdi	0,src,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied, but in
	   order to get proper alignment, we may have to copy some portions
	   again.  This is still faster than using unaligned vector
	   instructions, though.  */

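	/* vr3 always carries the aligned quadword loaded by the previous
	   iteration (or by the setup code), so each pass issues only two new
	   quadword loads for the 32 bytes it stores.  */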
	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,src,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	src,src,32
	stvx	6,0,dst
	stvx	10,dst,6
	addi	dst,dst,32
	bdnz	L(unaligned_loop)

	clrrdi	0,src,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,cnt
	beqlr	cr1

	add	src,src,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)