/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */

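/* Copy strategy, mirroring the POWER7 memcpy:
   - Moves of less than 32 bytes use plain integer loads and stores.
   - Moves of 32+ bytes whose SRC and DST alignments match (mod 8) are
     first brought to doubleword alignment and then copied 32 bytes
     (four doublewords) per loop iteration.
   - Moves of 32+ bytes with mismatched alignments align DST to a
     quadword and then combine aligned VMX loads with vperm to copy
     32 bytes per iteration.
   Every exit path returns DST + LEN.  */
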
#ifndef MEMPCPY
# define MEMPCPY __mempcpy
#endif
	.machine  power7
ENTRY_TOCLESS (MEMPCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	cfi_offset(31,-8)
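	/* DST is stashed at -16(r1) so that every exit path can rebuild the
	   DST + LEN return value; r31 is saved and used below to hold the
	   remaining length, and r0 = -DST feeds the alignment math.  */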
	ble	cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
				      code.  */

	andi.	11,3,7	      /* Check alignment of DST.  */


	clrldi	10,4,61	      /* Check alignment of SRC.  */
	cmpld	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srdi	9,5,3	      /* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrldi	0,0,61
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the DST (and hence SRC) aligned to 8 bytes.  */

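	/* r0 holds the number of bytes needed to reach the 8-byte boundary;
	   its low bits, copied into cr7, drive the bf tests on bits
	   31/30/29, which select the 1-, 2- and 4-byte copies below.  */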
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61      /* Check alignment of SRC again.  */
	srdi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrldi	11,31,61
	mtcrf	0x01,9

	srdi	8,31,5
	cmpldi	cr1,9,4
	cmpldi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	   at a multiple of 32 bytes.  */

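	/* r11/r10 track SRC/DST inside the 32-byte loop, while r12/r3 stay
	   at the current position so the tail code can advance them past
	   the doubleword-copied region in a single step.  */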
	bf	30,1f
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	bf	31,4f
	ld	0,16(12)
	std	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8

	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
	.align	4
4:
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32

	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */
	rldicr	0,31,0,60
	mtcrf	0x01,31
	beq	cr6,0f
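
	/* r0 = remaining length rounded down to a multiple of 8, i.e. the
	   number of bytes already stored by the doubleword code above;
	   cr6 holds the (remaining & 7) == 0 test, so the branch above
	   skips the tail entirely when nothing is left.  */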

.L9:
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmpldi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrdi	11,4,2
	andi.	0,8,3
	cmpldi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)
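
	/* r0 = 0~3 bytes needed to word-align SRC; the branch above skips
	   the prologue when SRC is already word-aligned.  r10 ends up
	   holding the length that remains once SRC is aligned.  */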

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f
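
	/* cr6 still holds the LEN == 8 comparison from above, so falling
	   through here means exactly 8 bytes and two word copies always
	   suffice.  */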

	/* Though we could have used ld/std here, they are still slow for
	   unaligned accesses.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	ld	3,-16(1)      /* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 32+ bytes where the SRC and DST alignments do
	   not match.  Align DST to a quadword, then use aligned quadword
	   loads from SRC, shifted to realign the data, allowing for
	   aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st
				 quadword.  */
	andi.	11,3,15	      /* Check alignment of DST (against
				 quadwords).  */
	srdi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	ld	6,0(12)
	addi	12,12,8
	std	6,0(3)
	addi	3,3,8
0:
	clrldi	10,12,60      /* Check alignment of SRC.  */
	srdi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two index registers to speed up the indexed vector
	   operations.  */
	clrldi	11,31,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,11,0
	srdi	8,31,5	      /* Set up the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,12
#else
	lvsl	5,0,12
#endif
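	/* vr5 is a permute control built from the SRC misalignment (lvsr
	   for little-endian, lvsl for big-endian); each vperm below merges
	   two consecutive aligned 16-byte loads into one realigned
	   quadword that can be stored with an aligned stvx.  */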
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy an extra 16 bytes here when the quadword count is odd,
	   since the loop below moves 32 bytes per iteration.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)
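
	/* ctr = number of 32-byte blocks to copy; when at most one full
	   quadword remains (tested via cr6), the loop is skipped and only
	   the tail code runs.  */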

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied, but in
	   order to get proper alignment, we may have to copy some portions
	   again.  This is still faster than using unaligned vector
	   instructions, though.  */

	lvx	4,11,6	      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	rldicr	0,31,0,59
	mtcrf	0x01,31
	beq	cr1,0f
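
	/* r0 = remaining length rounded down to a multiple of 16, i.e. the
	   bytes already stored by the vector code; advance SRC/DST past
	   them and finish the 1~15 tail bytes with integer copies.  cr1
	   still holds the (remaining & 15) == 0 test from the setup.  */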

	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

END_GEN_TB (MEMPCPY,TB_TOCLESS)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)