/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   Copyright (C) 2013-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

/* Thumb cannot encode negative immediate offsets in memory operations.  */
#ifndef NO_THUMB
#define NO_THUMB
#endif
#include <sysdep.h>
#include <arm-features.h>

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef MEMCPY_NEON

	.fpu neon
	.arch armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON

#elif defined (MEMCPY_VFP)

	.arch armv6
	.fpu vfpv2
# define FRAME_SIZE 32
# define USE_VFP

#else
	.arch armv6
# define FRAME_SIZE 32

#endif

#define ALIGN(addr, align) addr:align

#define INSN_SIZE 4

/* Call parameters.  */
#define dstin r0
#define src r1
#define count r2

/* Locals.  */
#define tmp1 r3
#define dst ip
#define tmp2 r8

/* These two macros both work by repeated invocation of the macro
   dispatch_step (not defined here).  That macro performs one "step",
   doing one load instruction and one store instruction to copy one
   "unit".  On entry, TMP1 contains the number of bytes to be copied,
   a multiple of the unit size.  The macro clobbers TMP1 in the
   process of doing a computed jump to the tail containing the
   appropriate number of steps.

   In dispatch_7_dword, dispatch_step is invoked seven times, with an
   argument that is 7 for the first and 1 for the last.  Units are
   double-words (8 bytes).  TMP1 is at most 56.

   In dispatch_15_word, dispatch_step is invoked fifteen times,
   with an argument that is 15 for the first and 1 for the last.
   Units are words (4 bytes).  TMP1 is at most 60.  */
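/* As a worked example of the computed jumps below (the !ARM_ALWAYS_BX
   case): each dispatch_step expands to exactly two 4-byte ARM
   instructions, so a dword step is 8 bytes of code copying 8 bytes of
   data and the byte count in TMP1 doubles as the code offset to skip;
   e.g. TMP1 == 16 lands on "dispatch_step 2", running the final two
   steps.  A word step is also 8 bytes of code but copies only 4 bytes,
   hence the "lsl #1" on that computed jump and the /2 factors in its
   immediate.  */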

#ifndef ARM_ALWAYS_BX
# if ARM_BX_ALIGN_LOG2 != 2
# error case not handled
# endif
	.macro dispatch_7_dword
	rsb tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
	add pc, pc, tmp1
	dispatch_step 7
	dispatch_step 6
	dispatch_step 5
	dispatch_step 4
	dispatch_step 3
	dispatch_step 2
	dispatch_step 1
	.purgem dispatch_step
	.endm

	.macro dispatch_15_word
	rsb tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
	add pc, pc, tmp1, lsl #1
	dispatch_step 15
	dispatch_step 14
	dispatch_step 13
	dispatch_step 12
	dispatch_step 11
	dispatch_step 10
	dispatch_step 9
	dispatch_step 8
	dispatch_step 7
	dispatch_step 6
	dispatch_step 5
	dispatch_step 4
	dispatch_step 3
	dispatch_step 2
	dispatch_step 1
	.purgem dispatch_step
	.endm
#else
# if ARM_BX_ALIGN_LOG2 < 3
# error case not handled
# endif
	.macro dispatch_helper steps, log2_bytes_per_step
	/* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
	   (STEPS << LOG2_BYTES_PER_STEP).
	   So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
	   Then it needs further adjustment to compensate for the
	   distance between the PC value taken below (0f + PC_OFS)
	   and the first step's instructions (1f).  */
	rsb tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
			  + ((1f - PC_OFS - 0f) \
			     >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
	/* Shifting down LOG2_BYTES_PER_STEP gives us the number of
	   steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
	   the (byte) distance to add to the PC.  */
0:	add tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
	bx tmp1
	.p2align ARM_BX_ALIGN_LOG2
1:
	.endm

	.macro dispatch_7_dword
	dispatch_helper 7, 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 7
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 6
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 5
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 4
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 2
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 1
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step
	.endm

	.macro dispatch_15_word
	dispatch_helper 15, 2
	dispatch_step 15
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 14
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 13
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 12
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 11
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 10
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 9
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 8
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 7
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 6
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 5
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 4
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 2
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 1
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step
	.endm

#endif

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l r2 /* Call-clobbered.  */
#define A_h r3 /* Call-clobbered.  */
#define B_l r4
#define B_h r5
#define C_l r6
#define C_h r7
/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
#define D_l r10
#define D_h r11
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines 5

#ifdef USE_VFP
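/* Copy one 64-byte line, stores leading loads: each vstr writes out
   data fetched by an earlier vldr, and the vldr that follows it
   refills the register for a later store (SRC runs 32 bytes ahead of
   DST here; see the set-up in .Lcpy_body_long).  \vreg enters holding
   the leading dword of this line and is refilled from
   prefetch_lines * 64 bytes further on, so it doubles as a software
   prefetch of a line to be copied later.  */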
	.macro cpy_line_vfp vreg, base
	vstr \vreg, [dst, #\base]
	vldr \vreg, [src, #\base]
	vstr d0, [dst, #\base + 8]
	vldr d0, [src, #\base + 8]
	vstr d1, [dst, #\base + 16]
	vldr d1, [src, #\base + 16]
	vstr d2, [dst, #\base + 24]
	vldr d2, [src, #\base + 24]
	vstr \vreg, [dst, #\base + 32]
	vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr d0, [dst, #\base + 40]
	vldr d0, [src, #\base + 40]
	vstr d1, [dst, #\base + 48]
	vldr d1, [src, #\base + 48]
	vstr d2, [dst, #\base + 56]
	vldr d2, [src, #\base + 56]
	.endm

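/* As cpy_line_vfp, but \vreg is not refilled from further ahead, so
   nothing beyond the current line is read; used to drain the last
   prefetched lines at the end of the long-copy loop.  */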
	.macro cpy_tail_vfp vreg, base
	vstr \vreg, [dst, #\base]
	vldr \vreg, [src, #\base]
	vstr d0, [dst, #\base + 8]
	vldr d0, [src, #\base + 8]
	vstr d1, [dst, #\base + 16]
	vldr d1, [src, #\base + 16]
	vstr d2, [dst, #\base + 24]
	vldr d2, [src, #\base + 24]
	vstr \vreg, [dst, #\base + 32]
	vstr d0, [dst, #\base + 40]
	vldr d0, [src, #\base + 40]
	vstr d1, [dst, #\base + 48]
	vldr d1, [src, #\base + 48]
	vstr d2, [dst, #\base + 56]
	vldr d2, [src, #\base + 56]
	.endm
#endif

	.p2align 6
ENTRY(memcpy)

	mov dst, dstin /* Preserve dstin, we need to return it.  */
	cmp count, #64
	bhs .Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  https://sourceware.org/bugzilla/show_bug.cgi?id=15647
	   tracks that bug; it was not fixed as of binutils-2.23.2.  */
	.macro neon_load_d0 reg
	vld1.8 {d0}, [\reg]!
	.endm
	.macro neon_store_d0 reg
	vst1.8 {d0}, [\reg]!
	.endm

	and tmp1, count, #0x38
	.macro dispatch_step i
	neon_load_d0 src
	neon_store_d0 dst
	.endm
	dispatch_7_dword

	tst count, #4
	ldrne tmp1, [src], #4
	strne tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and tmp1, count, #0x3c
	add dst, dst, tmp1
	add src, src, tmp1
	/* Jump directly into the sequence below at the correct offset.  */
	.macro dispatch_step i
	ldr tmp1, [src, #-(\i * 4)]
	str tmp1, [dst, #-(\i * 4)]
	.endm
	dispatch_15_word
#endif

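	/* At most 3 bytes remain.  After the shift below, the carry flag
	   holds bit 1 of COUNT (a trailing halfword to copy) and the N/Z
	   flags reflect bit 0 (a trailing byte).  */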
	lsls count, count, #31
	ldrhcs tmp1, [src], #2
	ldrbne src, [src] /* Src is dead, use as a scratch.  */
	strhcs tmp1, [dst], #2
	strbne src, [dst]
	bx lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str tmp2, [sp, #-FRAME_SIZE]!
	cfi_adjust_cfa_offset (FRAME_SIZE)
	cfi_rel_offset (tmp2, 0)
	cfi_remember_state
	and tmp2, src, #7
	and tmp1, dst, #7
	cmp tmp1, tmp2
	bne .Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32 s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
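	/* TMP2 becomes (8 - (dst & 7)) << 29; the MI, CS and NE
	   conditions below then peel off a word, a halfword and a byte
	   as required, which is exactly the number of bytes needed to
	   bring DST (and hence SRC) to 64-bit alignment.  */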
	lsls tmp2, dst, #29
	beq 1f
	rsbs tmp2, tmp2, #0
	sub count, count, tmp2, lsr #29
	ldrmi tmp1, [src], #4
	strmi tmp1, [dst], #4
	lsls tmp2, tmp2, #2
	ldrhcs tmp1, [src], #2
	ldrbne tmp2, [src], #1
	strhcs tmp1, [dst], #2
	strbne tmp2, [dst], #1

1:
	subs tmp2, count, #64 /* Use tmp2 for count.  */
	blo .Ltail63aligned

	cmp tmp2, #512
	bhs .Lcpy_body_long

.Lcpy_body_medium: /* Count in tmp2.  */
#ifdef USE_VFP
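	/* Copy 64 bytes per iteration, interleaving loads and stores
	   through the two scratch registers d0 and d1.  */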
1:
	vldr d0, [src, #0]
	subs tmp2, tmp2, #64
	vldr d1, [src, #8]
	vstr d0, [dst, #0]
	vldr d0, [src, #16]
	vstr d1, [dst, #8]
	vldr d1, [src, #24]
	vstr d0, [dst, #16]
	vldr d0, [src, #32]
	vstr d1, [dst, #24]
	vldr d1, [src, #40]
	vstr d0, [dst, #32]
	vldr d0, [src, #48]
	vstr d1, [dst, #40]
	vldr d1, [src, #56]
	vstr d0, [dst, #48]
	add src, src, #64
	vstr d1, [dst, #56]
	add dst, dst, #64
	bhs 1b
	tst tmp2, #0x3f
	beq .Ldone

.Ltail63aligned: /* Count in tmp2.  */
	and tmp1, tmp2, #0x38
	add dst, dst, tmp1
	add src, src, tmp1
	.macro dispatch_step i
	vldr d0, [src, #-(\i * 8)]
	vstr d0, [dst, #-(\i * 8)]
	.endm
	dispatch_7_dword
#else
	sub src, src, #8
	sub dst, dst, #8
1:
	ldrd A_l, A_h, [src, #8]
	strd A_l, A_h, [dst, #8]
	ldrd A_l, A_h, [src, #16]
	strd A_l, A_h, [dst, #16]
	ldrd A_l, A_h, [src, #24]
	strd A_l, A_h, [dst, #24]
	ldrd A_l, A_h, [src, #32]
	strd A_l, A_h, [dst, #32]
	ldrd A_l, A_h, [src, #40]
	strd A_l, A_h, [dst, #40]
	ldrd A_l, A_h, [src, #48]
	strd A_l, A_h, [dst, #48]
	ldrd A_l, A_h, [src, #56]
	strd A_l, A_h, [dst, #56]
	ldrd A_l, A_h, [src, #64]!
	strd A_l, A_h, [dst, #64]!
	subs tmp2, tmp2, #64
	bhs 1b
	tst tmp2, #0x3f
	bne 1f
	ldr tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx lr

	cfi_restore_state
	cfi_remember_state
1:
	add src, src, #8
	add dst, dst, #8

.Ltail63aligned: /* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and tmp1, tmp2, #0x38
	add dst, dst, tmp1
	add src, src, tmp1
	.macro dispatch_step i
	ldrd A_l, A_h, [src, #-(\i * 8)]
	strd A_l, A_h, [dst, #-(\i * 8)]
	.endm
	dispatch_7_dword
#endif

	tst tmp2, #4
	ldrne tmp1, [src], #4
	strne tmp1, [dst], #4
	lsls tmp2, tmp2, #31 /* Count (tmp2) now dead.  */
	ldrhcs tmp1, [src], #2
	ldrbne tmp2, [src]
	strhcs tmp1, [dst], #2
	strbne tmp2, [dst]

.Ldone:
	ldr tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx lr

	cfi_restore_state
	cfi_remember_state

.Lcpy_body_long: /* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	vldr d3, [src, #0]
	vldr d4, [src, #64]
	vldr d5, [src, #128]
	vldr d6, [src, #192]
	vldr d7, [src, #256]

	vldr d0, [src, #8]
	vldr d1, [src, #16]
	vldr d2, [src, #24]
	add src, src, #32
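	/* d3-d7 now hold the leading dword of each of the next five
	   64-byte lines and d0-d2 the rest of the first half-line.  SRC
	   is biased 32 bytes ahead of DST so the copy macros' loads run
	   half a line in front of their stores.  */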

	subs tmp2, tmp2, #prefetch_lines * 64 * 2
	blo 2f
1:
	cpy_line_vfp d3, 0
	cpy_line_vfp d4, 64
	cpy_line_vfp d5, 128
	add dst, dst, #3 * 64
	add src, src, #3 * 64
	cpy_line_vfp d6, 0
	cpy_line_vfp d7, 64
	add dst, dst, #2 * 64
	add src, src, #2 * 64
	subs tmp2, tmp2, #prefetch_lines * 64
	bhs 1b

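	/* Fewer than 2 * prefetch_lines lines remain.  Drain the five
	   lines whose leading dwords are already held in d3-d7 without
	   reading any further ahead, re-bias TMP2 for the bytes copied
	   here, and let the medium-copy loop finish off the rest.  */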
2:
	cpy_tail_vfp d3, 0
	cpy_tail_vfp d4, 64
	cpy_tail_vfp d5, 128
	add src, src, #3 * 64
	add dst, dst, #3 * 64
	cpy_tail_vfp d6, 0
	vstr d7, [dst, #64]
	vldr d7, [src, #64]
	vstr d0, [dst, #64 + 8]
	vldr d0, [src, #64 + 8]
	vstr d1, [dst, #64 + 16]
	vldr d1, [src, #64 + 16]
	vstr d2, [dst, #64 + 24]
	vldr d2, [src, #64 + 24]
	vstr d7, [dst, #64 + 32]
	add src, src, #96
	vstr d0, [dst, #64 + 40]
	vstr d1, [dst, #64 + 48]
	vstr d2, [dst, #64 + 56]
	add dst, dst, #128
	add tmp2, tmp2, #prefetch_lines * 64
	b .Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
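	/* A_l/A_h .. D_l/D_h carry 32 bytes at a time: each strd in the
	   loop writes out a pair loaded 32 bytes earlier in the stream
	   while the ldrd that follows refills it, keeping loads ahead of
	   stores.  B, C and D are callee-saved, so they are spilled into
	   the frame reserved on entry before their first use.  */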
	/* Pre-bias src and dst.  */
	sub src, src, #8
	sub dst, dst, #8
	pld [src, #8]
	pld [src, #72]
	subs tmp2, tmp2, #64
	pld [src, #136]
	ldrd A_l, A_h, [src, #8]
	strd B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldrd B_l, B_h, [src, #16]
	strd C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldrd C_l, C_h, [src, #24]
	strd D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	pld [src, #200]
	ldrd D_l, D_h, [src, #32]!
	b 1f
	.p2align 6
2:
	pld [src, #232]
	strd A_l, A_h, [dst, #40]
	ldrd A_l, A_h, [src, #40]
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [src, #48]
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [src, #56]
	strd D_l, D_h, [dst, #64]!
	ldrd D_l, D_h, [src, #64]!
	subs tmp2, tmp2, #64
1:
	strd A_l, A_h, [dst, #8]
	ldrd A_l, A_h, [src, #8]
	strd B_l, B_h, [dst, #16]
	ldrd B_l, B_h, [src, #16]
	strd C_l, C_h, [dst, #24]
	ldrd C_l, C_h, [src, #24]
	strd D_l, D_h, [dst, #32]
	ldrd D_l, D_h, [src, #32]
	bcs 2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd A_l, A_h, [dst, #40]
	add src, src, #40
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
	strd D_l, D_h, [dst, #64]
	ldrd D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
	add dst, dst, #72
	tst tmp2, #0x3f
	bne .Ltail63aligned
	ldr tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx lr
#endif

	cfi_restore_state
	cfi_remember_state

.Lcpy_notaligned:
	pld [src, #0]
	pld [src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls tmp2, dst, #29
	pld [src, #(2 * 64)]
	beq 1f
	rsbs tmp2, tmp2, #0
	sub count, count, tmp2, lsr #29
	ldrmi tmp1, [src], #4
	strmi tmp1, [dst], #4
	lsls tmp2, tmp2, #2
	ldrbne tmp1, [src], #1
	ldrhcs tmp2, [src], #2
	strbne tmp1, [dst], #1
	strhcs tmp2, [dst], #2
1:
	pld [src, #(3 * 64)]
	subs count, count, #64
	ldrlo tmp2, [sp], #FRAME_SIZE
	blo .Ltail63unaligned
	pld [src, #(4 * 64)]

#ifdef USE_NEON
	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  */
	.macro neon_load_multi reglist, basereg
	vld1.8 {\reglist}, [\basereg]!
	.endm
	.macro neon_store_multi reglist, basereg
	vst1.8 {\reglist}, [ALIGN (\basereg, 64)]!
	.endm
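	/* VLD1.8 tolerates the arbitrary source alignment; the stores
	   carry a 64-bit alignment hint (via the ALIGN macro) because
	   DST was brought to 8-byte alignment above.  */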

	neon_load_multi d0-d3, src
	neon_load_multi d4-d7, src
	subs count, count, #64
	blo 2f
1:
	pld [src, #(4 * 64)]
	neon_store_multi d0-d3, dst
	neon_load_multi d0-d3, src
	neon_store_multi d4-d7, dst
	neon_load_multi d4-d7, src
	subs count, count, #64
	bhs 1b
2:
	neon_store_multi d0-d3, dst
	neon_store_multi d4-d7, dst
	ands count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
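	/* Same software-pipelined structure as the aligned long copy,
	   but SRC may be arbitrarily misaligned, so the loads use LDR
	   (permitted by the unaligned-access assumption) instead of
	   LDRD, while the 8-byte-aligned DST side still uses STRD.  */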
	sub src, src, #4
	sub dst, dst, #8
	subs tmp2, count, #64 /* Use tmp2 for count.  */
	ldr A_l, [src, #4]
	ldr A_h, [src, #8]
	strd B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldr B_l, [src, #12]
	ldr B_h, [src, #16]
	strd C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldr C_l, [src, #20]
	ldr C_h, [src, #24]
	strd D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	ldr D_l, [src, #28]
	ldr D_h, [src, #32]!
	b 1f
	.p2align 6
2:
	pld [src, #(5 * 64) - (32 - 4)]
	strd A_l, A_h, [dst, #40]
	ldr A_l, [src, #36]
	ldr A_h, [src, #40]
	strd B_l, B_h, [dst, #48]
	ldr B_l, [src, #44]
	ldr B_h, [src, #48]
	strd C_l, C_h, [dst, #56]
	ldr C_l, [src, #52]
	ldr C_h, [src, #56]
	strd D_l, D_h, [dst, #64]!
	ldr D_l, [src, #60]
	ldr D_h, [src, #64]!
	subs tmp2, tmp2, #64
1:
	strd A_l, A_h, [dst, #8]
	ldr A_l, [src, #4]
	ldr A_h, [src, #8]
	strd B_l, B_h, [dst, #16]
	ldr B_l, [src, #12]
	ldr B_h, [src, #16]
	strd C_l, C_h, [dst, #24]
	ldr C_l, [src, #20]
	ldr C_h, [src, #24]
	strd D_l, D_h, [dst, #32]
	ldr D_l, [src, #28]
	ldr D_h, [src, #32]
	bcs 2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd A_l, A_h, [dst, #40]
	add src, src, #36
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
	strd D_l, D_h, [dst, #64]
	ldrd D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
	add dst, dst, #72
	ands count, tmp2, #0x3f
#endif
	ldr tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bne .Ltail63unaligned
	bx lr

END(memcpy)
libc_hidden_builtin_def (memcpy)