memcpy.S source code [glibc/sysdeps/powerpc/powerpc32/power4/memcpy.S]

1	/* Optimized memcpy implementation for PowerPC32 on PowerPC64.
2	Copyright (C) 2003-2024 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>.  */
18
19	#include <sysdep.h>
20
21	/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
22	Returns 'dst'.
23
24	Memcpy handles short copies (< 32-bytes) using binary move blocks
25	(no loops) of lwz/stw. The tail (remaining 1-3 bytes) is handled
26	with the appropriate combination of byte and halfword load/stores.
27	There is minimal effort to optimize the alignment of short moves.
28
29	Longer moves (>= 32-bytes) justify the effort to get at least the
30	destination word (4-byte) aligned. Further optimization is
31	possible when both source and destination are word aligned.
32	Each case has an optimized unrolled loop.  */
33
34	.machine power4
/* memcpy entry.  Per the prototype comment above: r3 = dst, r4 = src,
   r5 = len, and dst is returned in r3.  EALIGN, CALL_MCOUNT and the
   cfi_* macros are not defined in this file (NOTE(review): presumably
   they come from <sysdep.h>; confirm).

   Registers set up here for the long (>= 32 byte) path:
     r30  copy of the original dst, moved back into r3 before each blr
     r31  bytes still to copy once dst is word aligned
     r12  working source pointer
     r0   (-dst) & 3, the byte count needed to word-align dst
     r9   whole-word count (remaining bytes / 4)
     r10  src & 3
     r11  dst & 3  */
35	EALIGN (memcpy, `5`, `0`)
36	CALL_MCOUNT
37
/* Allocate a 32-byte frame and spill r30/r31 at offsets 20 and 24.  The
   length and alignment tests are interleaved with the saves.
   cr1 = len compared with 31.  */
38	stwu `1`,-`32`(`1`)
39	cfi_adjust_cfa_offset(`32`)
40	stw `30`,`20`(`1`)
41	cfi_offset(`30`,(`20`-`32`))
42	mr `30`,`3`
43	cmplwi cr1,`5`,`31`
44	stw `31`,`24`(`1`)
45	cfi_offset(`31`,(`24`-`32`))
46	neg `0`,`3`
/* The andi. below also sets cr0: eq means dst is already word aligned.
   That result is consumed by the plain beq further down.  */
47	andi. `11`,`3`,`3` / check alignment of dst. /
48	clrlwi `0`,`0`,`30` / Number of bytes until the 1st word of dst. /
49	clrlwi `10`,`4`,`30` / check alignment of src. /
/* cr6 = len compared with 8; the short path at .L2 uses it to select
   the 0-8 byte code at .LE8.  */
50	cmplwi cr6,`5`,`8`
51	ble- cr1,.L2 / If move < 32 bytes use short move code. /
/* Long path only from here.  cr6 is reused: eq now means
   (src & 3) == (dst & 3).  Both pointers advance by the same amount
   while dst is being aligned, so eq also means "src will be word aligned
   once dst is", which is what the bne- cr6 at .L0 tests.  */
52	cmplw cr6,`10`,`11`
53	mr `12`,`4`
54	srwi `9`,`5`,`2` / Number of full words remaining. /
/* CR field 7 <- low four bits of r0, so CR bits 30 and 31 hold the 2-bit
   and 1-bit of the alignment byte count for the bf tests below.  */
55	mtcrf `0x01`,`0`
56	mr `31`,`5`
/* cr0 still holds the andi. result: dst already aligned, skip ahead
   with r31 = len and r9 = len / 4.  */
57	beq .L0
58
/* r31 = len minus the bytes about to be copied to align dst.  */
59	subf `31`,`0`,`5`
60	/ Move 0-3 bytes as needed to get the destination word aligned. /
/* One byte if CR bit 31 is set, then one halfword if CR bit 30 is set.  */
61	`1`: bf `31`,`2f`
62	lbz `6`,`0`(`12`)
63	addi `12`,`12`,`1`
64	stb `6`,`0`(`3`)
65	addi `3`,`3`,`1`
66	`2`: bf `30`,`0f`
67	lhz `6`,`0`(`12`)
68	addi `12`,`12`,`2`
69	sth `6`,`0`(`3`)
70	addi `3`,`3`,`2`
71	`0`:
/* Recompute r10 and r9 for the advanced source pointer and reduced
   length.  cr6 is deliberately not recomputed (see the cmplw above).
   Execution falls through to .L0.  */
72	clrlwi `10`,`12`,`30` / check alignment of src again. /
73	srwi `9`,`31`,`2` / Number of full words remaining. /
74
75	/* Copy words from source to destination, assuming the destination is
76	aligned on a word boundary.
77
78	At this point we know there are at least 25 bytes left (32-7) to copy.
79	The next step is to determine if the source is also word aligned.
80	If not, branch to the unaligned move code at .L6, which uses
81	a load, shift, store strategy.
82
83	Otherwise source and destination are word aligned, and we can use
84	the optimized word copy loop.  */
/* Destination is word aligned here.  r31 = bytes left, r9 = r31 / 4,
   r12 = source pointer, r10 = source pointer & 3.
   r11 <- r31 & 3 (tail bytes).  CR field 7 <- low four bits of the word
   count r9, so CR bits 30/31 are the 2-bit and 1-bit of that count.
   cr6 was set by the cmplw in the entry code: ne means the source does
   not share the destination's word alignment.  */
85	.L0:
86	clrlwi `11`,`31`,`30` / calculate the number of tail bytes /
87	mtcrf `0x01`,`9`
88	bne- cr6,.L6 / If source is not word aligned. /
89
90	/* Move words where destination and source are word aligned.
91	Use an unrolled loop to copy 4 words (16-bytes) per iteration.
92	If the copy is not an exact multiple of 16 bytes, 1-3
93	words are copied as needed to set up the main loop. After
94	the main loop exits there may be a tail of 1-3 bytes. These bytes are
95	copied a halfword/byte at a time as needed to preserve alignment.  */
96
/* Source and destination are both word aligned.  On entry r9 = word
   count with its low four bits in CR field 7, r11 = tail byte count,
   r31 = bytes left, r12/r3 = source/destination.  r11 and r10 become the
   working source and destination pointers; r12 and r3 stay at the start
   of the word region until .L9.  */
97	srwi `8`,`31`,`4` / calculate the 16 byte loop count /
/* cr1 = word count compared with 4 (for the blt below).
   cr6 = tail byte count compared with 0 (eq means no tail).  */
98	cmplwi cr1,`9`,`4`
99	cmplwi cr6,`11`,`0`
100	mr `11`,`12`
101
/* CR bit 30 is the 2-bit of the word count: copy a word pair if set.  */
102	bf `30`,`1f`
103	lwz `6`,`0`(`12`)
104	lwz `7`,`4`(`12`)
105	addi `11`,`12`,`8`
106	mtctr `8`
107	stw `6`,`0`(`3`)
108	stw `7`,`4`(`3`)
109	addi `10`,`3`,`8`
/* CR bit 31 is the 1-bit of the word count: copy a third word if set.  */
110	bf `31`,`4f`
111	lwz `0`,`8`(`12`)
112	stw `0`,`8`(`3`)
/* Word count below 4: skip the 16-byte loop.  */
113	blt cr1,`3f`
114	addi `11`,`12`,`12`
115	addi `10`,`3`,`12`
116	b `4f`
117	.align `4`
118	`1`:
/* No word pair to copy first; copy a single word if CR bit 31 is set.  */
119	mr `10`,`3`
120	mtctr `8`
121	bf `31`,`4f`
122	lwz `6`,`0`(`12`)
123	addi `11`,`12`,`4`
124	stw `6`,`0`(`3`)
125	addi `10`,`3`,`4`
126
127	.align `4`
128	`4`:
/* Main loop: four word loads then four word stores (16 bytes) per
   iteration, repeated CTR times.  */
129	lwz `6`,`0`(`11`)
130	lwz `7`,`4`(`11`)
131	lwz `8`,`8`(`11`)
132	lwz `0`,`12`(`11`)
133	stw `6`,`0`(`10`)
134	stw `7`,`4`(`10`)
135	stw `8`,`8`(`10`)
136	stw `0`,`12`(`10`)
137	addi `11`,`11`,`16`
138	addi `10`,`10`,`16`
139	bdnz `4b`
140	`3`:
/* r0 = r31 with its low two bits cleared, i.e. the number of bytes
   handled as whole words.  CR field 7 <- low four bits of r31 for the
   tail tests.  When cr6 says the tail is empty, branch straight to the
   0: return sequence.  */
141	clrrwi `0`,`31`,`2`
142	mtcrf `0x01`,`31`
143	beq cr6,`0f`
144	.L9:
/* Step r3 and r12 over the word-copied region so they address the tail.
   .L9 is also the target of the bne cr6 at the end of the .L6 path.  */
145	add `3`,`3`,`0`
146	add `12`,`12`,`0`
147
148	/* At this point we have a tail of 0-3 bytes and we know that the
149	destination is word aligned.  */
/* Tail copy.  CR field 7 holds the low bits of the remaining length
   (loaded by the mtcrf from r31 just before .L9): CR bit 30 set means
   copy a halfword, CR bit 31 set means copy a final byte.  r12 and r3
   address the tail in source and destination.  */
150	`2`: bf `30`,`1f`
151	lhz `6`,`0`(`12`)
152	addi `12`,`12`,`2`
153	sth `6`,`0`(`3`)
154	addi `3`,`3`,`2`
155	`1`: bf `31`,`0f`
156	lbz `6`,`0`(`12`)
157	stb `6`,`0`(`3`)
158	`0`:
159	/ Return original dst pointer. /
/* Long-path epilogue: r3 <- saved dst, reload r30 and r31 from the
   frame slots written at entry, release the 32-byte frame.  */
160	mr `3`,`30`
161	lwz `30`,`20`(`1`)
162	lwz `31`,`24`(`1`)
163	addi `1`,`1`,`32`
164	blr
165
166	/* Copy up to 31 bytes. This is divided into two cases 0-8 bytes and
167	9-31 bytes. Each case is handled without loops, using binary
168	(1,2,4,8) tests.
169
170	In the short (0-8 byte) case no attempt is made to force alignment
171	of either source or destination. The hardware will handle the
172	unaligned load/stores with small delays for crossing 32-byte, 64-byte,
173	and 4096-byte boundaries. Since these short moves are unlikely to be
174	unaligned or cross these boundaries, the overhead to force
175	alignment is not justified.
176
177	The longer (9-31 byte) move is more likely to cross 32- or 64-byte
178	boundaries. Since only loads are sensitive to the 32-/64-byte
179	boundaries it is more important to align the source than the
180	destination. If the source is not already word aligned, we first
181	move 1-3 bytes as needed. While the destination and stores may
182	still be unaligned, this is only an issue for page (4096 byte
183	boundary) crossing, which should be rare for these short moves.
184	The hardware handles this case automatically with a small delay.  */
185
186	.align `4`
187	.L2:
/* Short path, len <= 31.  r3/r4/r5 are still dst/src/len, r30 holds dst
   and cr6 = len compared with 8, all from the entry code.
   CR field 7 <- low four bits of len (bit 28 = 8, 29 = 4, 30 = 2,
   31 = 1).  r11 = src rounded down to a word boundary.
   r0 = (-src) & 3 = bytes up to the next word boundary; the andi. also
   sets cr0, eq meaning src is already word aligned.  */
188	mtcrf `0x01`,`5`
189	neg `8`,`4`
190	clrrwi `11`,`4`,`2`
191	andi. `0`,`8`,`3`
192	ble cr6,.LE8 / Handle moves of 0-8 bytes. /
193	/ At least 9 bytes left. Get the source word aligned. /
/* cr1 = len compared with 16.  r10 = remaining length and r12 = source
   pointer; both are overwritten below if leading bytes are consumed.
   cr6 is reused for r0 compared with 2, selecting the 1-, 2- or 3-byte
   case.  */
194	cmplwi cr1,`5`,`16`
195	mr `10`,`5`
196	mr `12`,`4`
197	cmplwi cr6,`0`,`2`
198	beq .L3 / If the source is already word aligned skip this. /
199	/ Copy 1-3 bytes to get source address word aligned. /
/* Load the aligned word that contains the first source bytes and store
   its last r0 bytes, extracting them from r6/r7 with shifts and rotates
   rather than separate byte loads.  r10 and r12 are adjusted for the
   bytes consumed.  */
200	lwz `6`,`0`(`11`)
201	subf `10`,`0`,`5`
202	add `12`,`4`,`0`
/* r0 == 1: single byte, handled at 5:.  */
203	blt cr6,`5f`
204	srwi `7`,`6`,`16`
/* r0 == 3: byte plus halfword, handled at 3:.  Fall through for
   r0 == 2 (one halfword).  */
205	bgt cr6,`3f`
206	#ifdef __LITTLE_ENDIAN__
207	sth `7`,`0`(`3`)
208	#else
209	sth `6`,`0`(`3`)
210	#endif
211	b `7f`
212	.align `4`
213	`3`:
214	#ifdef __LITTLE_ENDIAN__
215	rotlwi `6`,`6`,`24`
216	stb `6`,`0`(`3`)
217	sth `7`,`1`(`3`)
218	#else
219	stb `7`,`0`(`3`)
220	sth `6`,`1`(`3`)
221	#endif
222	b `7f`
223	.align `4`
224	`5`:
225	#ifdef __LITTLE_ENDIAN__
226	rotlwi `6`,`6`,`8`
227	#endif
228	stb `6`,`0`(`3`)
229	`7`:
/* Re-derive cr1 and CR field 7 from the reduced length r10 and advance
   dst by the r0 bytes just stored.  */
230	cmplwi cr1,`10`,`16`
231	add `3`,`3`,`0`
232	mtcrf `0x01`,`10`
233	.align `4`
234	.L3:
235	/ At least 6 bytes left and the source is word aligned. /
/* cr1 lt: fewer than 16 bytes remain, skip the 16-byte block.  The
   remaining 8/4/2/1 pieces are selected by CR bits 28..31.  */
236	blt cr1,`8f`
237	`16`: / Move 16 bytes. /
238	lwz `6`,`0`(`12`)
239	lwz `7`,`4`(`12`)
240	stw `6`,`0`(`3`)
241	lwz `6`,`8`(`12`)
242	stw `7`,`4`(`3`)
243	lwz `7`,`12`(`12`)
244	addi `12`,`12`,`16`
245	stw `6`,`8`(`3`)
246	stw `7`,`12`(`3`)
247	addi `3`,`3`,`16`
248	`8`: / Move 8 bytes. /
249	bf `28`,`4f`
250	lwz `6`,`0`(`12`)
251	lwz `7`,`4`(`12`)
252	addi `12`,`12`,`8`
253	stw `6`,`0`(`3`)
254	stw `7`,`4`(`3`)
255	addi `3`,`3`,`8`
256	`4`: / Move 4 bytes. /
257	bf `29`,`2f`
258	lwz `6`,`0`(`12`)
259	addi `12`,`12`,`4`
260	stw `6`,`0`(`3`)
261	addi `3`,`3`,`4`
/* Label 2 is also entered from .LE8 (its "bf 29,2b") for lengths
   below 4, with r12 = src and r3 = dst.  */
262	`2`: / Move 2-3 bytes. /
263	bf `30`,`1f`
264	lhz `6`,`0`(`12`)
265	sth `6`,`0`(`3`)
266	bf `31`,`0f`
267	lbz `7`,`2`(`12`)
268	stb `7`,`2`(`3`)
/* Return dst.  Only r30 is reloaded: no instruction on the short path
   writes r31 after it was saved at entry.  */
269	mr `3`,`30`
270	lwz `30`,`20`(`1`)
271	addi `1`,`1`,`32`
272	blr
273	`1`: / Move 1 byte. /
274	bf `31`,`0f`
275	lbz `6`,`0`(`12`)
276	stb `6`,`0`(`3`)
277	`0`:
278	/ Return original dst pointer. /
279	mr `3`,`30`
280	lwz `30`,`20`(`1`)
281	addi `1`,`1`,`32`
282	blr
283
284	/ Special case to copy 0-8 bytes. /
285	.align `4`
286	.LE8:
/* len <= 8.  cr6 still holds len compared with 8 from the entry code and
   CR field 7 holds the low four bits of len (set at .L2).  No alignment
   is attempted: loads and stores use r4 and r3 directly with fixed
   offsets.  r12 <- src so that the shared 2-3 byte code reached through
   "2b" can use it.  */
287	mr `12`,`4`
/* len != 8: decode the length bit by bit at 4:.  Otherwise copy exactly
   two words.  */
288	bne cr6,`4f`
289	lwz `6`,`0`(`4`)
290	lwz `7`,`4`(`4`)
291	stw `6`,`0`(`3`)
292	stw `7`,`4`(`3`)
293	/ Return original dst pointer. /
294	mr `3`,`30`
295	lwz `30`,`20`(`1`)
296	addi `1`,`1`,`32`
297	blr
298	.align `4`
/* CR bit 29 (the 4-bit of len) clear: len is 0-3, so branch back to
   label 2 in the .L2 code.  Otherwise copy one word; the remaining 0-3
   bytes then sit at offsets 4..6.  */
299	`4`: bf `29`,`2b`
300	lwz `6`,`0`(`4`)
301	stw `6`,`0`(`3`)
/* NOTE(review): no branch to label 6 appears in this file.  */
302	`6`:
303	bf `30`,`5f`
304	lhz `7`,`4`(`4`)
305	sth `7`,`4`(`3`)
306	bf `31`,`0f`
307	lbz `8`,`6`(`4`)
308	stb `8`,`6`(`3`)
309	mr `3`,`30`
310	lwz `30`,`20`(`1`)
311	addi `1`,`1`,`32`
312	blr
313	.align `4`
314	`5`:
/* No halfword; copy the single byte at offset 4 if CR bit 31 is set.  */
315	bf `31`,`0f`
316	lbz `6`,`4`(`4`)
317	stb `6`,`4`(`3`)
318	.align `4`
319	`0`:
320	/ Return original dst pointer. /
321	mr `3`,`30`
322	lwz `30`,`20`(`1`)
323	addi `1`,`1`,`32`
324	blr
325
/* Target of the "bne- cr6,.L6" at .L0: the destination is word aligned
   but the source is not.  */
326	.align `4`
327	.L6:
328
329	/* Copy words where the destination is aligned but the source is
330	not. Use aligned word loads from the source, shifted to realign
331	the data, to allow aligned destination stores.
332	Use an unrolled loop to copy 4 words (16-bytes) per iteration.
333	A single word is retained for storing at loop exit to avoid walking
334	off the end of a page within the loop.
335	If the copy is not an exact multiple of 16 bytes, 1-3
336	words are copied as needed to set up the main loop. After
337	the main loop exits there may be a tail of 1-3 bytes. These bytes are
338	copied a halfword/byte at a time as needed to preserve alignment.  */
339
340
/* Setup for the load/shift/store copy.  On entry (from .L0): r3 = word
   aligned destination, r12 = source, r10 = source & 3, r9 = word count,
   r11 = tail byte count, r31 = bytes left.
     cr6    tail byte count compared with 0, tested at 3: below
     r5     source rounded down to a word boundary (aligned load pointer)
     r10    source misalignment in bits, 8 * (source & 3)
     r9     32 - r10, the complementary shift count
     r11    word count - 1; its low four bits go to CR field 7
     r8     r11 / 4, loaded into CTR as the 16-byte loop count
     r4     working destination pointer (r3 itself is not advanced)
     r6,r7  the two most recently loaded aligned source words
   Every destination word is built by shifting r6 and r7 in opposite
   directions by r10 and r9 and OR-ing the results; the two shift
   directions are swapped under __LITTLE_ENDIAN__.  */
341	cmplwi cr6,`11`,`0` / are there tail bytes left ? /
342	subf `5`,`10`,`12` / back up src pointer to prev word alignment /
343	slwi `10`,`10`,`3` / calculate number of bits to shift 1st word left /
344	addi `11`,`9`,-`1` / we move one word after the loop /
345	srwi `8`,`11`,`2` / calculate the 16 byte loop count /
346	lwz `6`,`0`(`5`) / load 1st src word into R6 /
347	mr `4`,`3`
348	lwz `7`,`4`(`5`) / load 2nd src word into R7 /
349	mtcrf `0x01`,`11`
350	subfic `9`,`10`,`32` / number of bits to shift 2nd word right /
351	mtctr `8`
/* CR bit 30 is the 2-bit of (word count - 1): clear means no word pair
   to store before the loop.  */
352	bf `30`,`1f`
353
354	/ there are at least two words to copy, so copy them /
355	#ifdef __LITTLE_ENDIAN__
356	srw `0`,`6`,`10`
357	slw `8`,`7`,`9`
358	#else
359	slw `0`,`6`,`10` / shift 1st src word to left align it in R0 /
360	srw `8`,`7`,`9` / shift 2nd src word to right align it in R8 /
361	#endif
362	or `0`,`0`,`8` / or them to get word to store /
363	lwz `6`,`8`(`5`) / load the 3rd src word /
364	stw `0`,`0`(`4`) / store the 1st dst word /
365	#ifdef __LITTLE_ENDIAN__
366	srw `0`,`7`,`10`
367	slw `8`,`6`,`9`
368	#else
369	slw `0`,`7`,`10` / now left align 2nd src word into R0 /
370	srw `8`,`6`,`9` / shift 3rd src word to right align it in R8 /
371	#endif
372	or `0`,`0`,`8` / or them to get word to store /
373	lwz `7`,`12`(`5`)
374	stw `0`,`4`(`4`) / store the 2nd dst word /
375	addi `4`,`4`,`8`
376	addi `5`,`5`,`16`
/* CR bit 31 is the 1-bit of (word count - 1): clear means go straight
   to the loop with r6/r7 already holding the next two source words.  */
377	bf `31`,`4f`
378	/ there is a third word to copy, so copy it /
379	#ifdef __LITTLE_ENDIAN__
380	srw `0`,`6`,`10`
381	slw `8`,`7`,`9`
382	#else
383	slw `0`,`6`,`10` / shift 3rd src word to left align it in R0 /
384	srw `8`,`7`,`9` / shift 4th src word to right align it in R8 /
385	#endif
386	or `0`,`0`,`8` / or them to get word to store /
387	stw `0`,`0`(`4`) / store 3rd dst word /
388	mr `6`,`7`
389	lwz `7`,`0`(`5`)
390	addi `5`,`5`,`4`
391	addi `4`,`4`,`4`
392	b `4f`
393	.align `4`
394	`1`:
/* No leading word pair.  The first destination word is assembled in r0
   here; it is stored only when CR bit 31 is set.  Otherwise the loop at
   4: rebuilds the same word from the unchanged r6/r7 and stores it.  */
395	#ifdef __LITTLE_ENDIAN__
396	srw `0`,`6`,`10`
397	slw `8`,`7`,`9`
398	#else
399	slw `0`,`6`,`10` / shift 1st src word to left align it in R0 /
400	srw `8`,`7`,`9` / shift 2nd src word to right align it in R8 /
401	#endif
402	addi `5`,`5`,`8`
403	or `0`,`0`,`8` / or them to get word to store /
404	bf `31`,`4f`
405	mr `6`,`7`
406	lwz `7`,`0`(`5`)
407	addi `5`,`5`,`4`
408	stw `0`,`0`(`4`) / store the 1st dst word /
409	addi `4`,`4`,`4`
410
411	.align `4`
412	`4`:
413	/ copy 16 bytes at a time /
/* Each iteration stores four destination words and loads four new source
   words, alternating r6 and r7 as the older/newer word.  On exit r6/r7
   still hold one unstored word's worth of data, written at 8: below.  */
414	#ifdef __LITTLE_ENDIAN__
415	srw `0`,`6`,`10`
416	slw `8`,`7`,`9`
417	#else
418	slw `0`,`6`,`10`
419	srw `8`,`7`,`9`
420	#endif
421	or `0`,`0`,`8`
422	lwz `6`,`0`(`5`)
423	stw `0`,`0`(`4`)
424	#ifdef __LITTLE_ENDIAN__
425	srw `0`,`7`,`10`
426	slw `8`,`6`,`9`
427	#else
428	slw `0`,`7`,`10`
429	srw `8`,`6`,`9`
430	#endif
431	or `0`,`0`,`8`
432	lwz `7`,`4`(`5`)
433	stw `0`,`4`(`4`)
434	#ifdef __LITTLE_ENDIAN__
435	srw `0`,`6`,`10`
436	slw `8`,`7`,`9`
437	#else
438	slw `0`,`6`,`10`
439	srw `8`,`7`,`9`
440	#endif
441	or `0`,`0`,`8`
442	lwz `6`,`8`(`5`)
443	stw `0`,`8`(`4`)
444	#ifdef __LITTLE_ENDIAN__
445	srw `0`,`7`,`10`
446	slw `8`,`6`,`9`
447	#else
448	slw `0`,`7`,`10`
449	srw `8`,`6`,`9`
450	#endif
451	or `0`,`0`,`8`
452	lwz `7`,`12`(`5`)
453	stw `0`,`12`(`4`)
454	addi `5`,`5`,`16`
455	addi `4`,`4`,`16`
456	bdnz+ `4b`
457	`8`:
458	/ calculate and store the final word /
459	#ifdef __LITTLE_ENDIAN__
460	srw `0`,`6`,`10`
461	slw `8`,`7`,`9`
462	#else
463	slw `0`,`6`,`10`
464	srw `8`,`7`,`9`
465	#endif
466	or `0`,`0`,`8`
467	stw `0`,`0`(`4`)
468	`3`:
/* r0 = r31 with its low two bits cleared (bytes copied as words) and
   CR field 7 <- low four bits of r31, exactly as .L9 and the tail code
   expect.  cr6 ne (tail bytes remain) branches to .L9, which advances
   r3/r12 by r0 and copies the tail; otherwise fall through and return.  */
469	clrrwi `0`,`31`,`2`
470	mtcrf `0x01`,`31`
471	bne cr6,.L9 / If the tail is 0 bytes we are done! /
472
473	/ Return original dst pointer. /
474	mr `3`,`30`
475	lwz `30`,`20`(`1`)
476	lwz `31`,`24`(`1`)
477	addi `1`,`1`,`32`
478	blr
479	END (memcpy)
480
/* NOTE(review): libc_hidden_builtin_def is not defined in this file;
   presumably it sets up glibc's hidden internal alias for memcpy.
   Confirm against the glibc headers.  */
481	libc_hidden_builtin_def (memcpy)
482

source code of glibc/sysdeps/powerpc/powerpc32/power4/memcpy.S