/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */

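/* Copy strategy, mirroring the POWER7 memcpy:
   - Moves of less than 32 bytes use plain integer loads and stores.
   - Moves of 32+ bytes whose SRC and DST alignments match (mod 8) are
     first brought to doubleword alignment and then copied 32 bytes
     (four doublewords) per loop iteration.
   - Moves of 32+ bytes with mismatched alignments align DST to a
     quadword and then combine aligned VMX loads with vperm to copy
     32 bytes per iteration.
   Every exit path returns DST + LEN.  */
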
#ifndef MEMPCPY
# define MEMPCPY __mempcpy
#endif
	.machine  power7
ENTRY_TOCLESS (MEMPCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	cfi_offset(31,-8)
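	/* DST is stashed at -16(r1) so that every exit path can rebuild the
	   DST + LEN return value; r31 is saved and used below to hold the
	   remaining length, and r0 = -DST feeds the alignment math.  */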
	ble	cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
				      code.  */

	andi.	11,3,7	      /* Check alignment of DST.  */


	clrldi	10,4,61	      /* Check alignment of SRC.  */
	cmpld	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srdi	9,5,3	      /* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrldi	0,0,61
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the DST (and hence SRC) aligned to 8 bytes.  */

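	/* r0 holds the number of bytes needed to reach the 8-byte boundary;
	   its low bits, copied into cr7, drive the bf tests on bits
	   31/30/29, which select the 1-, 2- and 4-byte copies below.  */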
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61      /* Check alignment of SRC again.  */
	srdi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrldi	11,31,61
	mtcrf	0x01,9

	srdi	8,31,5
	cmpldi	cr1,9,4
	cmpldi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	   at a multiple of 32 bytes.  */

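	/* r11/r10 track SRC/DST inside the 32-byte loop, while r12/r3 stay
	   at the current position so the tail code can advance them past
	   the doubleword-copied region in a single step.  */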
	bf	30,1f
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	bf	31,4f
	ld	0,16(12)
	std	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8

	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
	.align	4
4:
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32

	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */
	rldicr	0,31,0,60
	mtcrf	0x01,31
	beq	cr6,0f
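
	/* r0 = remaining length rounded down to a multiple of 8, i.e. the
	   number of bytes already stored by the doubleword code above;
	   cr6 holds the (remaining & 7) == 0 test, so the branch above
	   skips the tail entirely when nothing is left.  */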

.L9:
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmpldi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrdi	11,4,2
	andi.	0,8,3
	cmpldi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)
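
	/* r0 = 0~3 bytes needed to word-align SRC; the branch above skips
	   the prologue when SRC is already word-aligned.  r10 ends up
	   holding the length that remains once SRC is aligned.  */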

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f
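
	/* cr6 still holds the LEN == 8 comparison from above, so falling
	   through here means exactly 8 bytes and two word copies always
	   suffice.  */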

	/* Though we could have used ld/std here, they are still slow for
	   unaligned accesses.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	ld	3,-16(1)      /* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 32+ bytes where the SRC and DST alignments do
	   not match.  Align DST to a quadword, then use aligned quadword
	   loads from SRC, shifted to realign the data, allowing for
	   aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st
				 quadword.  */
	andi.	11,3,15	      /* Check alignment of DST (against
				 quadwords).  */
	srdi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	ld	6,0(12)
	addi	12,12,8
	std	6,0(3)
	addi	3,3,8
0:
	clrldi	10,12,60      /* Check alignment of SRC.  */
	srdi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two index registers to speed up the indexed vector
	   operations.  */
	clrldi	11,31,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,11,0
	srdi	8,31,5	      /* Set up the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,12
#else
	lvsl	5,0,12
#endif
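	/* vr5 is a permute control built from the SRC misalignment (lvsr
	   for little-endian, lvsl for big-endian); each vperm below merges
	   two consecutive aligned 16-byte loads into one realigned
	   quadword that can be stored with an aligned stvx.  */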
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy an extra 16 bytes here when the quadword count is odd,
	   since the loop below moves 32 bytes per iteration.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)
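
	/* ctr = number of 32-byte blocks to copy; when at most one full
	   quadword remains (tested via cr6), the loop is skipped and only
	   the tail code runs.  */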

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied, but in
	   order to get proper alignment, we may have to copy some portions
	   again.  This is still faster than using unaligned vector
	   instructions, though.  */

	lvx	4,11,6	      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	rldicr	0,31,0,59
	mtcrf	0x01,31
	beq	cr1,0f
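
	/* r0 = remaining length rounded down to a multiple of 16, i.e. the
	   bytes already stored by the vector code; advance SRC/DST past
	   them and finish the 1~15 tail bytes with integer copies.  cr1
	   still holds the (remaining & 15) == 0 test from the setup.  */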

	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

END_GEN_TB (MEMPCPY,TB_TOCLESS)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)