/* Optimized memcpy implementation for PowerPC A2.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <rtld-global-offsets.h>

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 4  /* number of cache lines to prefetch ahead of SRC */
#define ZERO_AHEAD 2      /* number of cache lines ahead of DST to zero */

        .section ".toc","aw"
__GLRO_DEF(dl_cache_line_size)


        .section ".text"
        .align 2


        .machine a2
ENTRY (MEMCPY, 5)
        CALL_MCOUNT 3

        dcbt    0,r4            /* Prefetch ONE SRC cacheline */
        cmpldi  cr1,r5,16       /* is size < 16 ? */
        mr      r6,r3           /* Copy dest reg to r6. */
        blt+    cr1,L(shortcopy)


/* Big copy (16 bytes or more)

   Figure out how far to the nearest quadword boundary, or if we are
   on one already.  Also get the cache line size.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
*/
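
/* A rough C sketch of the setup below (illustrative only; dst, src and
   len stand for r3, r4 and r5; to_align, src_off and line_mask land in
   r8, r7 and r10):

     size_t to_align = (-(uintptr_t) dst) & 15;  // bytes to 16-byte bdry
     ptrdiff_t src_off = src - dst;              // src reached as dst + off
     size_t line_mask = cache_line_size - 1;
*/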

        neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry */
        /* Get the cache line size. */
        __GLRO (r9, dl_cache_line_size,
                RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)
        clrldi  r8,r8,64-4      /* # bytes to 16-byte boundary */
        sub     r7,r4,r3        /* compute offset to src from dest */
        cmpldi  cr0,r8,0        /* Were we aligned on a 16 byte bdy? */
        addi    r10,r9,-1       /* Cache line mask */
        beq+    L(dst_aligned)



/* Destination is not aligned on quadword boundary.  Get us to one.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
   r7 - offset to src from dest
   r8 - number of bytes to quadword boundary
*/
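
/* Illustrative C equivalent of the bit-tested head copy below (the asm
   moves the low four bits of r8 into cr7 and tests them with bf; the
   variable names are hypothetical):

     unsigned char *d = dst;
     len -= to_align;
     if (to_align & 1) { *d = d[src_off]; d += 1; }
     if (to_align & 2) { *(uint16_t *) d = *(uint16_t *) (d + src_off); d += 2; }
     if (to_align & 4) { *(uint32_t *) d = *(uint32_t *) (d + src_off); d += 4; }
     if (to_align & 8) { *(uint64_t *) d = *(uint64_t *) (d + src_off); d += 8; }
     src = d + src_off;         // add r4,r7,r6
*/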

        mtcrf   0x01,r8         /* put #bytes to boundary into cr7 */
        subf    r5,r8,r5        /* adjust remaining len */

        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte */
        stb     r0,0(r6)
        addi    r6,r6,1
1:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 bytes */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        add     r4,r7,r6        /* update src addr */



/* Dest is quadword aligned now.

   Lots of decisions to make.  If we are copying less than a cache
   line we won't be here long.  If we are not on a cache line
   boundary we need to get there.  And then we need to figure out
   how many cache lines ahead to pre-touch.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
*/
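
/* Dispatch sketch (illustrative):

     if (cache_line_size == 0)        -> byte-copy fallback below
     else if (len <= line_mask)       -> L(lessthancacheline)
     else if (cache_line_size == 128) -> L(big_lines)
     else                             -> the 64-byte line path
*/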


        .align 4
L(dst_aligned):
        cmpdi   cr0,r9,0        /* Cache line size set? */
        bne+    cr0,L(cachelineset)

/* Cache line size not set: generic byte copy without much optimization */
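/* Roughly, in C (illustrative):

     if (len & 1) { *d++ = *s++; len--; }
     while (len) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; len -= 2; }
*/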
        clrldi. r0,r5,63        /* If length is odd copy one byte */
        beq     L(cachelinenotset_align)
        lbz     r7,0(r4)        /* Read one byte from source */
        addi    r5,r5,-1        /* Update length */
        addi    r4,r4,1         /* Update source pointer address */
        stb     r7,0(r6)        /* Store one byte at dest */
        addi    r6,r6,1         /* Update dest pointer address */
L(cachelinenotset_align):
        cmpdi   cr7,r5,0        /* If length is 0 return */
        beqlr   cr7
        ori     r2,r2,0         /* Force a new dispatch group */
L(cachelinenotset_loop):
        addic.  r5,r5,-2        /* Update length */
        lbz     r7,0(r4)        /* Load 2 bytes from source */
        lbz     r8,1(r4)
        addi    r4,r4,2         /* Update source pointer address */
        stb     r7,0(r6)        /* Store 2 bytes on dest */
        stb     r8,1(r6)
        addi    r6,r6,2         /* Update dest pointer address */
        bne     L(cachelinenotset_loop)
        blr


L(cachelineset):
        cmpd    cr5,r5,r10      /* Less than a cacheline to go? */

        neg     r7,r6           /* How far to next cacheline bdy? */

        addi    r6,r6,-8        /* prepare for stdu */
        cmpdi   cr0,r9,128
        addi    r4,r4,-8        /* prepare for ldu */


        ble+    cr5,L(lessthancacheline)

        beq-    cr0,L(big_lines) /* 128 byte line code */



/* More than a cacheline left to go, and using 64 byte cachelines */

        clrldi  r7,r7,64-6      /* How far to next cacheline bdy? */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already? */

        /* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy? */

        /* How many full cache lines to copy after getting to a line bdy? */
        srdi    r10,r5,6

        cmpldi  r10,0           /* If no full cache lines to copy ... */
        li      r11,0           /* number of cache lines to copy with prefetch */
        beq     L(nocacheprefetch)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do. */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,64+8        /* prefetch distance */
        ble     L(lessthanmaxprefetch)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed. */
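
/* Net effect of this branch and the fall-through (illustrative):

     r10 = min (full_lines, PREFETCH_AHEAD);  // batch size, loaded into ctr
     r11 = full_lines - r10;                  // lines left for the main loop
*/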

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch):
        mtctr   r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder. */

L(prefetchSRC):
        dcbt    r12,r4
        addi    r12,r12,64
        bdnz    L(prefetchSRC)


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7  - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch):
        mtctr   r7

        cmpldi  cr1,r5,64       /* Less than a cache line to copy? */

/* How many bytes are left after we copy whatever full
   cache lines we can get? */
        clrldi  r5,r5,64-6

        beq     cr6,L(cachelinealigned)


/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline)


        .align 4
L(cachelinealigned):            /* copy whole cache lines */

        blt-    cr1,L(lessthancacheline) /* size <64 */

L(outerloop):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop)

        li      r11,64*ZERO_AHEAD+8 /* DCBZ distance */

        .align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
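/* Each iteration touches a future SRC line with dcbt (r12 bytes ahead)
   and establishes a future DST line with dcbz (64*ZERO_AHEAD+8 bytes
   ahead of r6), so stores to that line need not fetch it from memory
   first, then copies one 64-byte line as eight doublewords. */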
L(loop):                        /* Copy aligned body */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        addi    r4, r4,0x40
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        stdu    r0, 0x40(r6)

        bdnz    L(loop)


L(endloop):
        cmpdi   r10,0
        beq-    L(endloop2)
        mtctr   r10

L(loop2):                       /* Copy aligned body */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        addi    r4, r4,0x40
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        stdu    r0, 0x40(r6)

        bdnz    L(loop2)
L(endloop2):


        .align 4
L(lessthancacheline):           /* Less than a cache line left to copy? */
        cmpldi  cr0,r5,16
        srdi    r7,r5,4         /* divide size by 16 */
        blt-    L(do_lt16)
        mtctr   r7

L(copy_remaining):
        ld      r8,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r8,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(copy_remaining)

L(do_lt16):                     /* less than 16 ? */
        cmpldi  cr0,r5,0        /* copy remaining bytes (0-15) */
        beqlr+                  /* nothing left to copy */
        addi    r4,r4,8
        addi    r6,r6,8
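/* Tail copy sketch (illustrative): r6 is the dest for the remaining
   0-15 bytes, with src reached via the offset r7 = src - dst; test
   bits 8, 4, 2 and 1 of the length, largest chunk first:

     if (len & 8) { copy 8 bytes; d += 8; }
     if (len & 4) { copy 4 bytes; d += 4; }
     if (len & 2) { copy 2 bytes; d += 2; }
     if (len & 1) { copy 1 byte; }
*/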
L(shortcopy):                   /* SIMPLE COPY to handle size <= 15 bytes */
        mtcrf   0x01,r5
        sub     r7,r4,r6
        bf-     cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 bytes */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 bytes */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 bytes */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte */
        stb     r0,0(r6)
1:
        blr




/* Similar to the 64-byte-line code above, but for 128 byte cache lines:
   masks use 127, shifts use 7, and the prefetch and dcbz distances are
   scaled to 128 bytes. */

L(big_lines):

        clrldi  r7,r7,64-7      /* How far to next cacheline bdy? */

        cmpldi  cr6,r7,0        /* Are we on a cacheline bdy already? */

        /* Reduce total len by what it takes to get to the next cache line */
        subf    r5,r7,r5
        srdi    r7,r7,4         /* How many qws to get to the line bdy? */

        /* How many full cache lines to copy after getting to a line bdy? */
        srdi    r10,r5,7

        cmpldi  r10,0           /* If no full cache lines to copy ... */
        li      r11,0           /* number of cache lines to copy with prefetch */
        beq     L(nocacheprefetch_128)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do. */

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,128+8       /* prefetch distance */
        ble     L(lessthanmaxprefetch_128)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed. */

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
        mtctr   r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder. */

L(prefetchSRC_128):
        dcbt    r12,r4
        addi    r12,r12,128
        bdnz    L(prefetchSRC_128)


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7  - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch_128):
        mtctr   r7

        cmpldi  cr1,r5,128      /* Less than a cache line to copy? */

/* How many bytes are left after we copy whatever full
   cache lines we can get? */
        clrldi  r5,r5,64-7

        beq     cr6,L(cachelinealigned_128)


/* Copy quadwords up to the next cacheline boundary */

L(aligntocacheline_128):
        ld      r9,0x08(r4)
        ld      r7,0x10(r4)
        addi    r4,r4,0x10
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    L(aligntocacheline_128)


L(cachelinealigned_128):        /* copy whole cache lines */

        blt-    cr1,L(lessthancacheline) /* size <128 */

L(outerloop_128):
        cmpdi   r11,0
        mtctr   r11
        beq-    L(endloop_128)

        li      r11,128*ZERO_AHEAD+8 /* DCBZ distance */

        .align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
L(loop_128):                    /* Copy aligned body */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ld      r0, 0x80(r4)
        addi    r4, r4,0x80
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    L(loop_128)


L(endloop_128):
        cmpdi   r10,0
        beq-    L(endloop2_128)
        mtctr   r10

L(loop2_128):                   /* Copy aligned body */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ld      r0, 0x80(r4)
        addi    r4, r4,0x80
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    L(loop2_128)
L(endloop2_128):

        b       L(lessthancacheline)


END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)