1 | /* Optimized memcpy implementation for PowerPC A2. |
2 | Copyright (C) 2010-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
#include <sysdep.h>
#include <rtld-global-offsets.h>

/* void *memcpy (void *dst, const void *src, size_t len);

   Optimized for the PowerPC A2 core: the destination is first brought
   to a quadword (16-byte) boundary, then whole cache lines are streamed
   with dcbt (touch) prefetching the source ahead and dcbz establishing
   destination lines without reading them.  The cache line size is read
   at run time from _dl_cache_line_size (set from the aux vector at
   startup); dedicated paths exist for 64- and 128-byte lines, plus a
   plain byte-copy fallback when the size is unknown (zero).  */

#define PREFETCH_AHEAD 4	/* number of cache lines to prefetch SRC ahead  */
#define ZERO_AHEAD 2		/* number of cache lines to dcbz DST ahead  */

	.machine a2
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
	cmplwi	cr1,r5,16	/* is size < 16 ?  */
	mr	r6,r3		/* Copy dest reg to r6; r3 is preserved as the
				   return value.  */
	blt+	cr1,L(shortcopy)


/* Big copy (16 bytes or more)

   Figure out how far to the nearest quadword boundary, or if we are
   on one already.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
 */

	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dest bdry  */
	clrlwi	r8,r8,32-4	/* align to 16byte boundary  */
	sub	r7,r4,r3	/* compute offset to src from dest  */
	cmplwi	cr0,r8,0	/* Were we aligned on a 16 byte bdy?  */
	beq+	L(dst_aligned)



/* Destination is not aligned on quadword boundary.  Get us to one.

   Each cr7 bit below corresponds to one bit of r8 (bytes-to-boundary),
   so testing bits 3..0 copies 1, 2, 4 and 8 bytes as needed.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
   r7 - offset to src from dest
   r8 - number of bytes to quadword boundary
 */

	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
	subf	r5,r8,r5	/* adjust remaining len  */

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte addr  */
	stb	r0,0(r6)
	addi	r6,r6,1
1:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte addr  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte addr  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+0,8f
	lfdx	r0,r7,r6	/* copy 8 byte addr  */
	stfd	r0,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* update src addr  */



/* Dest is quadword aligned now.

   Lots of decisions to make.  If we are copying less than a cache
   line we won't be here long.  If we are not on a cache line
   boundary we need to get there.  And then we need to figure out
   how many cache lines ahead to pre-touch.

   r3 - return value (always)
   r4 - current source addr
   r5 - copy length
   r6 - current dest addr
 */


	.align 4
L(dst_aligned):


#ifdef PIC
	mflr	r0
/* Establishes GOT addressability so we can load the cache line size
   from rtld_global_ro.  This value was set from the aux vector during
   startup.  */
	SETUP_GOT_ACCESS(r9,got_label)
	addis	r9,r9,_GLOBAL_OFFSET_TABLE_-got_label@ha
	addi	r9,r9,_GLOBAL_OFFSET_TABLE_-got_label@l
	mtlr	r0
#endif
	__GLRO(r9, r9, _dl_cache_line_size,
	       RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)

	cmplwi	cr5, r9, 0	/* Zero means the line size is unknown.  */
	bne+	cr5,L(cachelineset)

/* Cache line size not set: generic byte copy without much optimization  */
	andi.	r0,r5,1		/* If length is odd copy one byte.  */
	beq	L(cachelinenotset_align)
	lbz	r7,0(r4)	/* Read one byte from source.  */
	addi	r5,r5,-1	/* Update length.  */
	addi	r4,r4,1		/* Update source pointer address.  */
	stb	r7,0(r6)	/* Store one byte on dest.  */
	addi	r6,r6,1		/* Update dest pointer address.  */
L(cachelinenotset_align):
	cmpwi	cr7,r5,0	/* If length is 0 return.  */
	beqlr	cr7
	ori	r2,r2,0		/* Force a new dispatch group.  */
L(cachelinenotset_loop):	/* Length is even here, so copy in pairs.  */
	addic.	r5,r5,-2	/* Update length.  */
	lbz	r7,0(r4)	/* Load 2 bytes from source.  */
	lbz	r8,1(r4)
	addi	r4,r4,2		/* Update source pointer address.  */
	stb	r7,0(r6)	/* Store 2 bytes on dest.  */
	stb	r8,1(r6)
	addi	r6,r6,2		/* Update dest pointer address.  */
	bne	L(cachelinenotset_loop)
	blr


L(cachelineset):

	addi	r10,r9,-1	/* line size - 1, used as an alignment mask  */

	cmpw	cr5,r5,r10	/* Less than a cacheline to go?  */

	neg	r7,r6		/* How far to next cacheline bdy?  */

	addi	r6,r6,-8	/* prepare for stdu  */
	cmpwi	cr0,r9,128	/* 128-byte lines take the big_lines path  */
	addi	r4,r4,-8	/* prepare for ldu  */


	ble+	cr5,L(lessthancacheline)

	beq-	cr0,L(big_lines) /* 128 byte line code  */




/* More than a cacheline left to go, and using 64 byte cachelines  */

	clrlwi	r7,r7,32-6	/* How far to next cacheline bdy?  */

	cmplwi	cr6,r7,0	/* Are we on a cacheline bdy already?  */

	/* Reduce total len by what it takes to get to the next cache line  */
	subf	r5,r7,r5
	srwi	r7,r7,4		/* How many qws to get to the line bdy?  */

	/* How many full cache lines to copy after getting to a line bdy?  */
	srwi	r10,r5,6

	cmplwi	r10,0		/* If no full cache lines to copy ...  */
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	L(nocacheprefetch)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do.  */

	cmplwi	r10,PREFETCH_AHEAD
	li	r12,64+8	/* prefetch distance  */
	ble	L(lessthanmaxprefetch)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed.  */

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

L(lessthanmaxprefetch):
	mtctr	r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder.  */

L(prefetchSRC):
	dcbt	r12,r4
	addi	r12,r12,64
	bdnz	L(prefetchSRC)


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7  - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch):
	mtctr	r7

	cmplwi	cr1,r5,64	/* Less than a cache line to copy?  */

	/* How many bytes are left after we copy whatever full
	   cache lines we can get?  */
	clrlwi	r5,r5,32-6

	beq	cr6,L(cachelinealigned)


/* Copy quadwords up to the next cacheline boundary  */

L(aligntocacheline):
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	L(aligntocacheline)


	.align 4
L(cachelinealigned):		/* copy while cache lines  */

	blt-	cr1,L(lessthancacheline) /* size <64  */

L(outerloop):
	cmpwi	r11,0
	mtctr	r11
	beq-	L(endloop)

	li	r11,64*ZERO_AHEAD +8	/* DCBZ dist  */

	.align	4
/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop):			/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	lfd	fp9, 0x08(r4)
	dcbz	r11,r6		/* Allocate DST line without reading memory.  */
	lfd	fp10, 0x10(r4)
	lfd	fp11, 0x18(r4)
	lfd	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfd	fp12, 0x20(r6)
	lfd	fp9, 0x28(r4)
	lfd	fp10, 0x30(r4)
	lfd	fp11, 0x38(r4)
	lfdu	fp12, 0x40(r4)
	stfd	fp9, 0x28(r6)
	stfd	fp10, 0x30(r6)
	stfd	fp11, 0x38(r6)
	stfdu	fp12, 0x40(r6)

	bdnz	L(loop)


L(endloop):
	cmpwi	r10,0		/* r10 = lines in the initial prefetched batch;
				   copy them now without further prefetch.  */
	beq-	L(endloop2)
	mtctr	r10

L(loop2):			/* Copy aligned body  */
	lfd	fp9, 0x08(r4)
	lfd	fp10, 0x10(r4)
	lfd	fp11, 0x18(r4)
	lfd	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfd	fp12, 0x20(r6)
	lfd	fp9, 0x28(r4)
	lfd	fp10, 0x30(r4)
	lfd	fp11, 0x38(r4)
	lfdu	fp12, 0x40(r4)
	stfd	fp9, 0x28(r6)
	stfd	fp10, 0x30(r6)
	stfd	fp11, 0x38(r6)
	stfdu	fp12, 0x40(r6)

	bdnz	L(loop2)
L(endloop2):
	/* Fall through: at most a partial cache line (r5 < 64) remains.  */


	.align 4
L(lessthancacheline):		/* Was there less than cache to do ?  */
	cmplwi	cr0,r5,16
	srwi	r7,r5,4		/* divide size by 16  */
	blt-	L(do_lt16)
	mtctr	r7

L(copy_remaining):
	lfd	fp9, 0x08(r4)
	lfdu	fp10, 0x10(r4)
	stfd	fp9, 0x08(r6)
	stfdu	fp10, 0x10(r6)
	bdnz	L(copy_remaining)

L(do_lt16):			/* less than 16 ?  */
	cmplwi	cr0,r5,0	/* copy remaining bytes (0-15)  */
	beqlr+			/* no rest to copy  */
	addi	r4,r4,8		/* undo the -8 bias applied for ldu/stdu  */
	addi	r6,r6,8

L(shortcopy):			/* SIMPLE COPY to handle size =< 15 bytes  */
	mtcrf	0x01,r5		/* cr7 bits 0..3 = r5 bits 8,4,2,1  */
	sub	r7,r4,r6	/* offset to src from dest  */
	bf-	cr7*4+0,8f
	lfdx	fp9,r7,r6	/* copy 8 byte  */
	stfd	fp9,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
1:
	blr





/* Similar to above, but for use with 128 byte lines.  */


L(big_lines):

	clrlwi	r7,r7,32-7	/* How far to next cacheline bdy?  */

	cmplwi	cr6,r7,0	/* Are we on a cacheline bdy already?  */

	/* Reduce total len by what it takes to get to the next cache line  */
	subf	r5,r7,r5
	srwi	r7,r7,4		/* How many qw to get to the line bdy?  */

	/* How many full cache lines to copy after getting to a line bdy?  */
	srwi	r10,r5,7

	cmplwi	r10,0		/* If no full cache lines to copy ...  */
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	L(nocacheprefetch_128)


/* We are here because we have at least one full cache line to copy,
   and therefore some pre-touching to do.  */

	cmplwi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance  */
	ble	L(lessthanmaxprefetch_128)

/* We can only do so much pre-fetching.  R11 will have the count of
   lines left to prefetch after the initial batch of prefetches
   are executed.  */

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
	mtctr	r10

/* At this point r10/ctr hold the number of lines to prefetch in this
   initial batch, and r11 holds any remainder.  */

L(prefetchSRC_128):
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	L(prefetchSRC_128)


/* Prefetching is done, or was not needed.

   cr6 - are we on a cacheline boundary already?
   r7  - number of quadwords to the next cacheline boundary
*/

L(nocacheprefetch_128):
	mtctr	r7

	cmplwi	cr1,r5,128	/* Less than a cache line to copy?  */

	/* How many bytes are left after we copy whatever full
	   cache lines we can get?  */
	clrlwi	r5,r5,32-7

	beq	cr6,L(cachelinealigned_128)


/* Copy quadwords up to the next cacheline boundary  */

L(aligntocacheline_128):
	lfd	fp9,0x08(r4)
	lfdu	fp10,0x10(r4)
	stfd	fp9,0x08(r6)
	stfdu	fp10,0x10(r6)
	bdnz	L(aligntocacheline_128)


L(cachelinealigned_128):	/* copy while cache lines  */

	blt-	cr1,L(lessthancacheline) /* size <128  */

L(outerloop_128):
	cmpwi	r11,0
	mtctr	r11
	beq-	L(endloop_128)

	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist  */

	.align	4
/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop_128):			/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	lfd	fp9, 0x08(r4)
	dcbz	r11,r6		/* Allocate DST line without reading memory.  */
	lfd	fp10, 0x10(r4)
	lfd	fp11, 0x18(r4)
	lfd	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfd	fp12, 0x20(r6)
	lfd	fp9, 0x28(r4)
	lfd	fp10, 0x30(r4)
	lfd	fp11, 0x38(r4)
	lfd	fp12, 0x40(r4)
	stfd	fp9, 0x28(r6)
	stfd	fp10, 0x30(r6)
	stfd	fp11, 0x38(r6)
	stfd	fp12, 0x40(r6)
	lfd	fp9, 0x48(r4)
	lfd	fp10, 0x50(r4)
	lfd	fp11, 0x58(r4)
	lfd	fp12, 0x60(r4)
	stfd	fp9, 0x48(r6)
	stfd	fp10, 0x50(r6)
	stfd	fp11, 0x58(r6)
	stfd	fp12, 0x60(r6)
	lfd	fp9, 0x68(r4)
	lfd	fp10, 0x70(r4)
	lfd	fp11, 0x78(r4)
	lfdu	fp12, 0x80(r4)
	stfd	fp9, 0x68(r6)
	stfd	fp10, 0x70(r6)
	stfd	fp11, 0x78(r6)
	stfdu	fp12, 0x80(r6)

	bdnz	L(loop_128)


L(endloop_128):
	cmpwi	r10,0		/* r10 = lines in the initial prefetched batch;
				   copy them now without further prefetch.  */
	beq-	L(endloop2_128)
	mtctr	r10

L(loop2_128):			/* Copy aligned body  */
	lfd	fp9, 0x08(r4)
	lfd	fp10, 0x10(r4)
	lfd	fp11, 0x18(r4)
	lfd	fp12, 0x20(r4)
	stfd	fp9, 0x08(r6)
	stfd	fp10, 0x10(r6)
	stfd	fp11, 0x18(r6)
	stfd	fp12, 0x20(r6)
	lfd	fp9, 0x28(r4)
	lfd	fp10, 0x30(r4)
	lfd	fp11, 0x38(r4)
	lfd	fp12, 0x40(r4)
	stfd	fp9, 0x28(r6)
	stfd	fp10, 0x30(r6)
	stfd	fp11, 0x38(r6)
	stfd	fp12, 0x40(r6)
	lfd	fp9, 0x48(r4)
	lfd	fp10, 0x50(r4)
	lfd	fp11, 0x58(r4)
	lfd	fp12, 0x60(r4)
	stfd	fp9, 0x48(r6)
	stfd	fp10, 0x50(r6)
	stfd	fp11, 0x58(r6)
	stfd	fp12, 0x60(r6)
	lfd	fp9, 0x68(r4)
	lfd	fp10, 0x70(r4)
	lfd	fp11, 0x78(r4)
	lfdu	fp12, 0x80(r4)
	stfd	fp9, 0x68(r6)
	stfd	fp10, 0x70(r6)
	stfd	fp11, 0x78(r6)
	stfdu	fp12, 0x80(r6)
	bdnz	L(loop2_128)
L(endloop2_128):

	/* Finish the remaining partial line in the common tail code.  */
	b	L(lessthancacheline)


END (memcpy)
libc_hidden_builtin_def (memcpy)
526 | |