/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary block moves
   (no loops) of lwz/stw.  The tail (the remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword loads/stores.
   There is minimal effort to optimize the alignment of short moves.
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
   of handling unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.  */

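/* As a rough C-level sketch of the strategy just described (a hedged
   model only: model_memcpy is a hypothetical name, memcpy (dst, src, 8)
   merely stands in for one ld/std pair, and the real code below unrolls
   the doubleword loop 4x and has a separate shifted path when the source
   is not also doubleword aligned):

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   void *model_memcpy (void *dstv, const void *srcv, size_t len)
   {
     unsigned char *dst = dstv;
     const unsigned char *src = srcv;
     if (len >= 32)
       {
         size_t head = (-(uintptr_t) dst) & 7;   // 0-7 bytes to align dst
         for (size_t i = 0; i < head; i++)
           *dst++ = *src++;
         len -= head;
         for (; len >= 8; len -= 8, dst += 8, src += 8)
           memcpy (dst, src, 8);                 // one doubleword move
       }
     while (len--)                               // short copies and the tail
       *dst++ = *src++;
     return dstv;                                // memcpy returns original dst
   }  */
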
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

ENTRY_TOCLESS (MEMCPY, 5)
        CALL_MCOUNT 3

        cmpldi cr1,5,31
        neg 0,3
        std 3,-16(1)
        std 31,-8(1)
        cfi_offset(31,-8)
        andi. 11,3,7    /* check alignment of dst.  */
        clrldi 0,0,61   /* Number of bytes until the 1st doubleword of dst.  */
        clrldi 10,4,61  /* check alignment of src.  */
        cmpldi cr6,5,8
        ble- cr1,.L2    /* If move < 32 bytes use short move code.  */
        cmpld cr6,10,11
        mr 12,4
        srdi 9,5,3      /* Number of full double words remaining.  */
        mtcrf 0x01,0
        mr 31,5
        beq .L0

        subf 31,0,5
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
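  /* A rough, hedged C model of this 0-7 byte alignment step (the function
     name and shape are illustrative only): the mtcrf above moved the low
     three bits of the byte count into CR7, so bf 31/30/29 amount to
     testing bits 1, 2 and 4 of the count; memcpy stands in for the
     halfword/word load-store pairs.

     #include <stddef.h>
     #include <string.h>

     static void copy_head_0_to_7 (unsigned char **d, const unsigned char **s,
                                   size_t head)
     {
       if (head & 1)                                       // bf 31: one byte
         { **d = **s; (*d)++; (*s)++; }
       if (head & 2)                                       // bf 30: halfword
         { memcpy (*d, *s, 2); *d += 2; *s += 2; }
       if (head & 4)                                       // bf 29: word
         { memcpy (*d, *s, 4); *d += 4; *s += 4; }
     }  */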
1:      bf 31,2f
        lbz 6,0(12)
        addi 12,12,1
        stb 6,0(3)
        addi 3,3,1
2:      bf 30,4f
        lhz 6,0(12)
        addi 12,12,2
        sth 6,0(3)
        addi 3,3,2
4:      bf 29,0f
        lwz 6,0(12)
        addi 12,12,4
        stw 6,0(3)
        addi 3,3,4
0:
        clrldi 10,12,61 /* check alignment of src again.  */
        srdi 9,31,3     /* Number of full double words remaining.  */

  /* Copy doublewords from source to destination, assuming the
     destination is aligned on a doubleword boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also doubleword aligned.
     If not, branch to the unaligned move code at .L6, which uses
     a load, shift, store strategy.

     Otherwise source and destination are doubleword aligned, and we can
     use the optimized doubleword copy loop.  */
.L0:
        clrldi 11,31,61
        mtcrf 0x01,9
        bne- cr6,.L6    /* If source is not DW aligned.  */

  /* Move doublewords where destination and source are DW aligned.
     Use an unrolled loop to copy 4 doublewords (32 bytes) per iteration.
     If the copy is not an exact multiple of 32 bytes, 1-3
     doublewords are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-7 bytes.  These bytes are
     copied a word/halfword/byte at a time as needed to preserve alignment.  */
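  /* A rough, hedged C model of this loop structure (illustrative names;
     memcpy (d, s, n) stands in for the ld/std pairs, and the peeling in
     the real code is interleaved with setting up the CTR):

     #include <stddef.h>
     #include <string.h>

     static void copy_dw_aligned (unsigned char *d, const unsigned char *s,
                                  size_t ndw)   // number of whole doublewords
     {
       size_t iters = ndw / 4;                  // srdi 8,31,5: 32-byte passes
       if (ndw & 2)                             // peel two doublewords
         { memcpy (d, s, 16); d += 16; s += 16; }
       if (ndw & 1)                             // peel one doubleword
         { memcpy (d, s, 8); d += 8; s += 8; }
       while (iters--)                          // main loop: 32 bytes per pass
         { memcpy (d, s, 32); d += 32; s += 32; }
       // the 0-7 byte tail is handled after the loop
     }  */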

        srdi 8,31,5
        cmpldi cr1,9,4
        cmpldi cr6,11,0
        mr 11,12

        bf 30,1f
        ld 6,0(12)
        ld 7,8(12)
        addi 11,12,16
        mtctr 8
        std 6,0(3)
        std 7,8(3)
        addi 10,3,16
        bf 31,4f
        ld 0,16(12)
        std 0,16(3)
        blt cr1,3f
        addi 11,12,24
        addi 10,3,24
        b 4f
        .align 4
1:
        mr 10,3
        mtctr 8
        bf 31,4f
        ld 6,0(12)
        addi 11,12,8
        std 6,0(3)
        addi 10,3,8

        .align 4
4:
        ld 6,0(11)
        ld 7,8(11)
        ld 8,16(11)
        ld 0,24(11)
        addi 11,11,32
2:
        std 6,0(10)
        std 7,8(10)
        std 8,16(10)
        std 0,24(10)
        addi 10,10,32
        bdnz 4b
3:

        rldicr 0,31,0,60
        mtcrf 0x01,31
        beq cr6,0f
.L9:
        add 3,3,0
        add 12,12,0

/* At this point we have a tail of 0-7 bytes and we know that the
   destination is doubleword aligned.  */
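/* Hedged C model of this tail step (mtcrf 0x01,31 put the low bits of the
   remaining length into CR7, so bf 29/30/31 test bits 4, 2 and 1, widest
   move first; memcpy stands in for the word/halfword moves):

   #include <stddef.h>
   #include <string.h>

   static void copy_tail_0_to_7 (unsigned char *d, const unsigned char *s,
                                 size_t tail)
   {
     if (tail & 4) { memcpy (d, s, 4); d += 4; s += 4; }   // bf 29: word
     if (tail & 2) { memcpy (d, s, 2); d += 2; s += 2; }   // bf 30: halfword
     if (tail & 1) *d = *s;                                // bf 31: byte
   }  */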
4:      bf 29,2f
        lwz 6,0(12)
        addi 12,12,4
        stw 6,0(3)
        addi 3,3,4
2:      bf 30,1f
        lhz 6,0(12)
        addi 12,12,2
        sth 6,0(3)
        addi 3,3,2
1:      bf 31,0f
        lbz 6,0(12)
        stb 6,0(3)
0:
  /* Return original dst pointer.  */
        ld 31,-8(1)
        ld 3,-16(1)
        blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we avoid
   doubleword load/stores to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096-byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small delay.  */
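/* A hedged C model of the 9-31 byte path just described (illustrative
   only; the real code does the 1-3 byte prologue with a single aligned
   word load plus shifts, while this sketch copies those bytes one at a
   time, and memcpy stands in for the word and smaller load/store pairs):

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   static void copy_9_to_31 (unsigned char *d, const unsigned char *s,
                             size_t len)
   {
     size_t pre = (-(uintptr_t) s) & 3;        // 0-3 bytes to word-align src
     for (size_t i = 0; i < pre; i++)
       *d++ = *s++;
     len -= pre;
     if (len & 16) { memcpy (d, s, 16); d += 16; s += 16; }
     if (len & 8)  { memcpy (d, s, 8);  d += 8;  s += 8;  }
     if (len & 4)  { memcpy (d, s, 4);  d += 4;  s += 4;  }
     if (len & 2)  { memcpy (d, s, 2);  d += 2;  s += 2;  }
     if (len & 1)  *d = *s;
   }  */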

        .align 4
.L2:
        mtcrf 0x01,5
        neg 8,4
        clrrdi 11,4,2
        andi. 0,8,3
        ble cr6,.LE8    /* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
        cmpldi cr1,5,16
        mr 10,5
        mr 12,4
        cmpldi cr6,0,2
        beq .L3         /* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
        lwz 6,0(11)
        subf 10,0,5
        add 12,4,0
        blt cr6,5f
        srdi 7,6,16
        bgt cr6,3f
#ifdef __LITTLE_ENDIAN__
        sth 7,0(3)
#else
        sth 6,0(3)
#endif
        b 7f
        .align 4
3:
#ifdef __LITTLE_ENDIAN__
        rotlwi 6,6,24
        stb 6,0(3)
        sth 7,1(3)
#else
        stb 7,0(3)
        sth 6,1(3)
#endif
        b 7f
        .align 4
5:
#ifdef __LITTLE_ENDIAN__
        rotlwi 6,6,8
#endif
        stb 6,0(3)
7:
        cmpldi cr1,10,16
        add 3,3,0
        mtcrf 0x01,10
        .align 4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
        blt cr1,8f
16:     /* Move 16 bytes.  */
        lwz 6,0(12)
        lwz 7,4(12)
        stw 6,0(3)
        lwz 6,8(12)
        stw 7,4(3)
        lwz 7,12(12)
        addi 12,12,16
        stw 6,8(3)
        stw 7,12(3)
        addi 3,3,16
8:      /* Move 8 bytes.  */
        bf 28,4f
        lwz 6,0(12)
        lwz 7,4(12)
        addi 12,12,8
        stw 6,0(3)
        stw 7,4(3)
        addi 3,3,8
4:      /* Move 4 bytes.  */
        bf 29,2f
        lwz 6,0(12)
        addi 12,12,4
        stw 6,0(3)
        addi 3,3,4
2:      /* Move 2-3 bytes.  */
        bf 30,1f
        lhz 6,0(12)
        sth 6,0(3)
        bf 31,0f
        lbz 7,2(12)
        stb 7,2(3)
        ld 3,-16(1)
        blr
1:      /* Move 1 byte.  */
        bf 31,0f
        lbz 6,0(12)
        stb 6,0(3)
0:
  /* Return original dst pointer.  */
        ld 3,-16(1)
        blr

/* Special case to copy 0-8 bytes.  */
        .align 4
.LE8:
        mr 12,4
        bne cr6,4f
/* We would have liked to use ld/std here, but the 630 processors are
   slow for load/store doubles that are not at least word aligned.
   Unaligned load/store word executes with only a 1 cycle penalty.  */
        lwz 6,0(4)
        lwz 7,4(4)
        stw 6,0(3)
        stw 7,4(3)
  /* Return original dst pointer.  */
        ld 3,-16(1)
        blr
        .align 4
4:      bf 29,2b
        lwz 6,0(4)
        stw 6,0(3)
6:
        bf 30,5f
        lhz 7,4(4)
        sth 7,4(3)
        bf 31,0f
        lbz 8,6(4)
        stb 8,6(3)
        ld 3,-16(1)
        blr
        .align 4
5:
        bf 31,0f
        lbz 6,4(4)
        stb 6,4(3)
        .align 4
0:
  /* Return original dst pointer.  */
        ld 3,-16(1)
        blr

        .align 4
.L6:

  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.  */
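  /* A hedged C model of this shifted-copy strategy for little-endian
     (illustrative only; the big-endian code swaps the shift directions,
     and the real loop is software-pipelined two doublewords at a time,
     with the odd leading doubleword and final partial doubleword handled
     separately):

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     static void copy_dw_shifted_le (unsigned char *d, const unsigned char *s,
                                     size_t ndw)  // whole doublewords to move
     {
       // Aligned address at or below s; sh is 8..56 bits because this
       // path is only used when the source is not doubleword aligned.
       const unsigned char *as = (const unsigned char *)
         ((uintptr_t) s & ~(uintptr_t) 7);
       unsigned sh = ((uintptr_t) s & 7) * 8;
       uint64_t lo, hi;
       memcpy (&lo, as, 8);                       // aligned ld
       for (size_t i = 0; i < ndw; i++)
         {
           memcpy (&hi, as + 8 * (i + 1), 8);     // next aligned ld
           uint64_t out = (lo >> sh) | (hi << (64 - sh));  // srd, sld, or
           memcpy (d + 8 * i, &out, 8);           // aligned std (d is aligned)
           lo = hi;
         }
     }  */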
        subf 5,10,12
        andi. 0,9,1
        cmpldi cr6,11,0
        sldi 10,10,3
        mr 11,9
        mr 4,3
        ld 6,0(5)
        ld 7,8(5)
        subfic 9,10,64
        beq 2f
#ifdef __LITTLE_ENDIAN__
        srd 0,6,10
#else
        sld 0,6,10
#endif
        cmpldi 11,1
        mr 6,7
        addi 4,4,-8
        addi 11,11,-1
        b 1f
2:      addi 5,5,8
        .align 4
#ifdef __LITTLE_ENDIAN__
0:      srd 0,6,10
        sld 8,7,9
#else
0:      sld 0,6,10
        srd 8,7,9
#endif
        cmpldi 11,2
        ld 6,8(5)
        or 0,0,8
        addi 11,11,-2
        std 0,0(4)
#ifdef __LITTLE_ENDIAN__
        srd 0,7,10
1:      sld 8,6,9
#else
        sld 0,7,10
1:      srd 8,6,9
#endif
        or 0,0,8
        beq 8f
        ld 7,16(5)
        std 0,8(4)
        addi 5,5,16
        addi 4,4,16
        b 0b
        .align 4
8:
        std 0,8(4)
        rldicr 0,31,0,60
        mtcrf 0x01,31
        bne cr6,.L9     /* Branch back to copy the tail, if any.  */
  /* Return original dst pointer.  */
        ld 31,-8(1)
        ld 3,-16(1)
        blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)