/* Optimized memcpy implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] memcpy (void *dst [r3], const void *src [r4], size_t len [r5]);
   Returns 'dst'.  */

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define dst 11		/* Use r11 so that r3 stays unchanged for the return value.  */
#define src 4
#define cnt 5

	.machine power7
ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,cnt,31
	neg	0,3		/* r0 = -dst; its low 4 bits give the distance
				   to the next quadword boundary.  */
	ble	cr1, L(copy_LT_32)  /* If the move is < 32 bytes use the short
				       move code.  */

/* Align copies using VSX instructions to quadword boundaries.  This avoids
   alignment traps when memcpy is used on non-cacheable memory (for instance,
   memory-mapped I/O).  */
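
/* Three paths follow: short moves (< 32 bytes) were already dispatched
   above; when SRC and DST share the same quadword misalignment they are
   aligned together and copied with the 128-byte loop; otherwise the
   vperm-based realignment path at L(copy_GE_32_unaligned) is used.  */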
	andi.	10,3,15		/* DST alignment (also sets cr0 for beq below).  */
	clrldi	11,4,60		/* SRC alignment.  */
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	dst,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
	clrldi	0,0,60

/* Get the DST and SRC aligned to 16 bytes.  r0 holds the number of bytes
   to the next quadword boundary; mtocrf copied its low 4 bits into CR7,
   so bits 31/30/29/28 select the 1-, 2-, 4- and 8-byte moves below.  */
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,16f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
16:
	subf	cnt,0,cnt	/* Deduct the bytes copied to align.  */

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
	mtocrf	0x02,cnt	/* cnt bits for 64/32/16 -> CR6 (bf 25/26/27).  */
	srdi	12,cnt,7	/* Number of 128-byte blocks.  */
	cmpdi	12,0
	beq	L(aligned_tail)
	lvx	6,0,src
	lvx	7,src,6
	mtctr	12
	b	L(aligned_128loop)
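
/* The first pair of quadwords is loaded before entering the loop;
   L(aligned_128head) repeats those loads for the second and later
   iterations, so loads overlap the previous iteration's stores.  */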

	.align	4
L(aligned_128head):
	/* Loads for the second and subsequent iterations of this loop.  */
	lvx	6,0,src
	lvx	7,src,6
L(aligned_128loop):
	lvx	8,src,7
	lvx	9,src,8
	stvx	6,0,dst
	addi	src,src,64
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	lvx	6,0,src
	lvx	7,src,6
	addi	dst,dst,64
	lvx	8,src,7
	lvx	9,src,8
	addi	src,src,64
	stvx	6,0,dst
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	addi	dst,dst,64
	bdnz	L(aligned_128head)

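/* Copy the remaining 0~127 bytes.  CR6 (set in L(aligned_copy)) selects
   the 64-, 32- and 16-byte chunks via bf 25/26/27; CR7 (set below) selects
   the final 8, 4, 2 and 1 bytes via bf 28/29/30/31.  */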
L(aligned_tail):
	mtocrf	0x01,cnt	/* cnt low 4 bits -> CR7.  */
	bf	25,32f
	lvx	6,0,src
	lvx	7,src,6
	lvx	8,src,7
	lvx	9,src,8
	addi	src,src,64
	stvx	6,0,dst
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	addi	dst,dst,64
32:
	bf	26,16f
	lvx	6,0,src
	lvx	7,src,6
	addi	src,src,32
	stvx	6,0,dst
	stvx	7,dst,6
	addi	dst,dst,32
16:
	bf	27,8f
	lvx	6,0,src
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
8:
	bf	28,4f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr


/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	mr	dst,3
	cmpldi	cr6,cnt,8
	mtocrf	0x01,cnt	/* cnt low 4 bits -> CR7 for the tail code.  */
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3		/* Bytes needed to word-align SRC.  */
	cmpldi	cr1,cnt,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	cnt,0,cnt
2:
	bf	30,1f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,cnt,16
	mtocrf	0x01,cnt

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	lwz	8,8(src)
	stw	7,4(dst)
	lwz	6,12(src)
	addi	src,src,16
	stw	8,8(dst)
	stw	6,12(dst)
	addi	dst,dst,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8

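/* The tails below test CR7 bits of cnt: bit 29 = 4 bytes, bit 30 = 2 bytes,
   bit 31 = 1 byte; bflr 31 returns directly when no odd byte remains.  */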
	.align	4
	/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(src)
	sth	6,0(dst)
	bflr	31
	lbz	7,2(src)
	stb	7,2(dst)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(src)
	stb	6,4(dst)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(src)
	stb	6,0(dst)
	/* Return original DST pointer.  */
	blr


/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,L(tail4)	/* cnt != 8: CR7 bits select the 0~7-byte tail.  */

	/* Exactly 8 bytes: though we could have used ld/std here, they are
	   still slow for the unaligned case.  */

	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	stw	7,4(dst)
	blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
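
/* Illustrative sketch of the technique, in C-like pseudocode (not part of
   the build; 'align16' and 'merge' are hypothetical helpers, 'merge'
   standing for the vperm whose control vector lvsl/lvsr derive from SRC's
   low 4 bits):

     prev = *(vector *) align16 (src);
     while (quadwords--)
       {
	 next = *(vector *) (align16 (src) + 16);
	 *(vector *) dst = merge (prev, next, src & 15);
	 prev = next;  src += 16;  dst += 16;
       }
 */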
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)	/* cr0: DST already aligned.  */

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	cnt,0,cnt

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
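	/* Note that only DST gets aligned here; SRC keeps a nonzero offset
	   (the alignments differ on this path), which the vperm realignment
	   below absorbs.  */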
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,0f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
0:
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indices to speed up the indexed vector operations.  */
	clrldi	10,cnt,60	/* Tail bytes (cnt % 16).  */
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,10,0
	srdi	8,cnt,5	      /* Set up the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
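	/* lvsl/lvsr build a permute-control vector from SRC's low 4 bits;
	   vperm applied to two adjacent aligned loads then produces 16
	   contiguous bytes starting at SRC.  Little-endian needs lvsr and
	   swapped vperm operands.  */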
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,src
#else
	lvsl	5,0,src
#endif
	lvx	3,0,src
	li	0,0
	bf	31,L(setup_unaligned_loop)

	/* The quadword count is odd: copy 16 bytes now so the loop below
	   works on 32-byte multiples.  */
	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
	vor	3,4,4
	clrrdi	0,src,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)	/* At most 1 quadword: skip the loop.  */

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied, but in
	   order to get proper alignment, we may have to copy some portions
	   again.  This is still faster than using unaligned vector loads.  */

	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,src,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	src,src,32
	stvx	6,0,dst
	stvx	10,dst,6
	addi	dst,dst,32
	bdnz	L(unaligned_loop)

	clrrdi	0,src,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,cnt
	beqlr	cr1

	add	src,src,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)