/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   Copyright (C) 2013-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

/* Thumb cannot encode negative immediate offsets in memory operations.  */
#ifndef NO_THUMB
#define NO_THUMB
#endif
#include <sysdep.h>
#include <arm-features.h>

        .syntax unified
        /* This implementation requires ARM state.  */
        .arm

#ifdef MEMCPY_NEON

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_VFP
# define USE_NEON

#elif defined (MEMCPY_VFP)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE     32

#endif
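/* FRAME_SIZE covers the spill area used below: tmp2 is always saved at
   [sp, #0], and the non-NEON bulk-copy loops additionally spill the
   B/C/D register pairs at [sp, #8] through [sp, #24], hence 32 bytes
   there versus 4 for the NEON build.  */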

#define ALIGN(addr, align) addr:align

#define INSN_SIZE       4

/* Call parameters.  */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals.  */
#define tmp1    r3
#define dst     ip
#define tmp2    r8

/* These two macros both work by repeated invocation of the macro
   dispatch_step (not defined here).  That macro performs one "step",
   doing one load instruction and one store instruction to copy one
   "unit".  On entry, TMP1 contains the number of bytes to be copied,
   a multiple of the unit size.  The macro clobbers TMP1 in the
   process of doing a computed jump to the tail containing the
   appropriate number of steps.

   In dispatch_7_dword, dispatch_step is invoked seven times, with an
   argument that is 7 for the first and 1 for the last.  Units are
   double-words (8 bytes).  TMP1 is at most 56.

   In dispatch_15_word, dispatch_step is invoked fifteen times,
   with an argument that is 15 for the first and 1 for the last.
   Units are words (4 bytes).  TMP1 is at most 60.  */

#ifndef ARM_ALWAYS_BX
# if ARM_BX_ALIGN_LOG2 != 2
#  error case not handled
# endif
        .macro dispatch_7_dword
        rsb     tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
        add     pc, pc, tmp1
        dispatch_step 7
        dispatch_step 6
        dispatch_step 5
        dispatch_step 4
        dispatch_step 3
        dispatch_step 2
        dispatch_step 1
        .purgem dispatch_step
        .endm

        .macro dispatch_15_word
        rsb     tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
        add     pc, pc, tmp1, lsl #1
        dispatch_step 15
        dispatch_step 14
        dispatch_step 13
        dispatch_step 12
        dispatch_step 11
        dispatch_step 10
        dispatch_step 9
        dispatch_step 8
        dispatch_step 7
        dispatch_step 6
        dispatch_step 5
        dispatch_step 4
        dispatch_step 3
        dispatch_step 2
        dispatch_step 1
        .purgem dispatch_step
        .endm
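/* Worked example for dispatch_7_dword, assuming PC_OFS is 8 (ARM state):
   the RSB leaves 52 - bytes_to_copy in TMP1, and the PC read by the ADD
   is the address of the ADD plus 8.  With TMP1 = 56 (seven dwords) the
   jump lands 4 bytes after the ADD, on "dispatch_step 7"; with TMP1 = 8
   it lands 52 bytes after it, on "dispatch_step 1".  dispatch_15_word is
   the same idea with 4-byte units: each step is still 8 bytes of code,
   so the ADD doubles TMP1 with "lsl #1", which is why PC_OFS and
   INSN_SIZE appear halved in the constant.  */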
#else
# if ARM_BX_ALIGN_LOG2 < 3
#  error case not handled
# endif
        .macro dispatch_helper steps, log2_bytes_per_step
        /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
           (STEPS << LOG2_BYTES_PER_STEP).
           So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
           Then it needs further adjustment to compensate for the
           distance between the PC value taken below (0f + PC_OFS)
           and the first step's instructions (1f).  */
        rsb     tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
                              + ((1f - PC_OFS - 0f) \
                                 >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
        /* Shifting down LOG2_BYTES_PER_STEP gives us the number of
           steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
           the (byte) distance to add to the PC.  */
0:      add     tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
        bx      tmp1
        .p2align ARM_BX_ALIGN_LOG2
1:
        .endm

        .macro dispatch_7_dword
        dispatch_helper 7, 3
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 7
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 6
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 5
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 4
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 3
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 2
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 1
        .p2align ARM_BX_ALIGN_LOG2
        .purgem dispatch_step
        .endm

        .macro dispatch_15_word
        dispatch_helper 15, 2
        dispatch_step 15
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 14
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 13
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 12
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 11
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 10
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 9
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 8
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 7
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 6
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 5
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 4
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 3
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 2
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 1
        .p2align ARM_BX_ALIGN_LOG2
        .purgem dispatch_step
        .endm

#endif

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l     r2              /* Call-clobbered.  */
#define A_h     r3              /* Call-clobbered.  */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
#define D_l     r10
#define D_h     r11
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines  5

#ifdef USE_VFP
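        /* Copy one 64-byte line from SRC to DST.  \vreg enters holding
           data fetched well ahead of the current position on an earlier
           call; it is stored, then refilled from prefetch_lines lines
           ahead so that the loads double as a software prefetch.  d0-d2
           stream the body of the line, with loads running 32 bytes
           ahead of the stores (SRC leads DST by 32 in the long-copy
           loop).  */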
        .macro cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

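        /* As cpy_line_vfp, but \vreg is not refilled from ahead of the
           current position, so the tail of the buffer can be copied
           without reading past the end of the source.  */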
        .macro cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

        .p2align 6
ENTRY(memcpy)

        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
        cmp     count, #64
        bhs     .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
        /* These need an extra layer of macro just to work around a
           bug in the assembler's parser when an operand starts with
           a {...}.  https://sourceware.org/bugzilla/show_bug.cgi?id=15647
           tracks that bug; it was not fixed as of binutils-2.23.2.  */
        .macro neon_load_d0 reg
        vld1.8  {d0}, [\reg]!
        .endm
        .macro neon_store_d0 reg
        vst1.8  {d0}, [\reg]!
        .endm

        and     tmp1, count, #0x38
        .macro dispatch_step i
        neon_load_d0 src
        neon_store_d0 dst
        .endm
        dispatch_7_dword

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        /* Jump directly into the sequence below at the correct offset.  */
        .macro dispatch_step i
        ldr     tmp1, [src, #-(\i * 4)]
        str     tmp1, [dst, #-(\i * 4)]
        .endm
        dispatch_15_word
#endif

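        /* Shift the low two bits of COUNT into the flags: C is set if a
           trailing halfword remains, and the result is non-zero exactly
           when a trailing byte remains, so the CS/NE transfers below
           finish the copy.  */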
        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch.  */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str     tmp2, [sp, #-FRAME_SIZE]!
        cfi_adjust_cfa_offset (FRAME_SIZE)
        cfi_rel_offset (tmp2, 0)
        cfi_remember_state
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring SRC and DST into full 64-bit alignment.  */
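        /* The LSLS below moves the low three bits of DST to the top of
           TMP2; if they are all zero we are already aligned.  Otherwise
           the RSBS leaves the number of bytes needed to reach 8-byte
           alignment in bits 29-31: a word is copied on MI (bit 2), and
           after one more shift a halfword on CS (bit 1) and a byte on
           NE (bit 0), with COUNT reduced to match.  */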
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        blo     .Ltail63aligned

        cmp     tmp2, #512
        bhs     .Lcpy_body_long

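        /* Medium-sized aligned copies: a simple loop moving 64 bytes per
           iteration, interleaving loads and stores through d0/d1 (VFP)
           or a single LDRD/STRD pair (integer).  */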
.Lcpy_body_medium:                      /* Count in tmp2.  */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bhs     1b
        tst     tmp2, #0x3f
        beq     .Ldone

.Ltail63aligned:                        /* Count in tmp2.  */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        .macro dispatch_step i
        vldr    d0, [src, #-(\i * 8)]
        vstr    d0, [dst, #-(\i * 8)]
        .endm
        dispatch_7_dword
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bhs     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bx      lr

        cfi_restore_state
        cfi_remember_state
1:
        add     src, src, #8
        add     dst, dst, #8

.Ltail63aligned:                        /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 64-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        .macro dispatch_step i
        ldrd    A_l, A_h, [src, #-(\i * 8)]
        strd    A_l, A_h, [dst, #-(\i * 8)]
        .endm
        dispatch_7_dword
#endif

        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead.  */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

.Ldone:
        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bx      lr

        cfi_restore_state
        cfi_remember_state

.Lcpy_body_long:                        /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */

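        /* Prime the software pipeline: d3-d7 hold the leading double-word
           of each of the next five 64-byte lines, and d0-d2 hold the next
           three double-words of the first line.  The loop below recycles
           them through cpy_line_vfp.  */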
        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blo     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bhs     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       .Lcpy_body_medium
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
        /* Pre-bias src and dst.  */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        cfi_rel_offset (B_l, 8)
        cfi_rel_offset (B_h, 12)
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        cfi_rel_offset (C_l, 16)
        cfi_rel_offset (C_h, 20)
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        cfi_rel_offset (D_l, 24)
        cfi_rel_offset (D_h, 28)
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
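        /* Software-pipelined main loop: A_l/A_h..D_l/D_h always hold the
           32 bytes most recently loaded, each STRD drains data from the
           preceding group of loads, and the PLDs keep the following
           source lines streaming into the cache.  */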
        .p2align 6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        cfi_restore (B_l)
        cfi_restore (B_h)
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        cfi_restore (C_l)
        cfi_restore (C_h)
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        cfi_restore (D_l)
        cfi_restore (D_h)
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     .Ltail63aligned
        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bx      lr
#endif

        cfi_restore_state
        cfi_remember_state

.Lcpy_notaligned:
        pld     [src, #0]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
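        /* Same flag trick as the aligned path above: the negated low
           bits of DST select the byte, halfword and word copies needed
           to reach alignment.  */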
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrlo   tmp2, [sp], #FRAME_SIZE
        blo     .Ltail63unaligned
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
        /* These need an extra layer of macro just to work around a
           bug in the assembler's parser when an operand starts with
           a {...}.  */
        .macro neon_load_multi reglist, basereg
        vld1.8  {\reglist}, [\basereg]!
        .endm
        .macro neon_store_multi reglist, basereg
        vst1.8  {\reglist}, [ALIGN (\basereg, 64)]!
        .endm
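        /* DST was brought to 64-bit alignment above, so the stores use a
           64-bit alignment hint (the ALIGN macro expands to the reg:64
           form); the loads stay unaligned.  d0-d3 and d4-d7 are double
           buffered so that each store group overlaps the next load.  */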

        neon_load_multi d0-d3, src
        neon_load_multi d4-d7, src
        subs    count, count, #64
        blo     2f
1:
        pld     [src, #(4 * 64)]
        neon_store_multi d0-d3, dst
        neon_load_multi d0-d3, src
        neon_store_multi d4-d7, dst
        neon_load_multi d4-d7, src
        subs    count, count, #64
        bhs     1b
2:
        neon_store_multi d0-d3, dst
        neon_store_multi d4-d7, dst
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth.  */
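        /* DST is 8-byte aligned but SRC is not, so the loads use single
           LDRs (which accept unaligned addresses) while the stores to
           the aligned DST still use STRD.  */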
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        cfi_rel_offset (B_l, 8)
        cfi_rel_offset (B_h, 12)
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        cfi_rel_offset (C_l, 16)
        cfi_rel_offset (C_h, 20)
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        cfi_rel_offset (D_l, 24)
        cfi_rel_offset (D_h, 28)
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        cfi_restore (B_l)
        cfi_restore (B_h)
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        cfi_restore (C_l)
        cfi_restore (C_h)
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        cfi_restore (D_l)
        cfi_restore (D_h)
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bne     .Ltail63unaligned
        bx      lr

END(memcpy)
libc_hidden_builtin_def (memcpy)

source code of glibc/sysdeps/arm/armv7/multiarch/memcpy_impl.S