/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   Copyright (C) 2013-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

/* Thumb cannot encode negative immediate offsets in memory operations.  */
#ifndef NO_THUMB
#define NO_THUMB
#endif
#include <sysdep.h>
#include <arm-features.h>

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef MEMCPY_NEON

	.fpu neon
	.arch armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON

#elif defined (MEMCPY_VFP)

	.arch armv6
	.fpu vfpv2
# define FRAME_SIZE 32
# define USE_VFP

#else
	.arch armv6
# define FRAME_SIZE 32

#endif

#define ALIGN(addr, align) addr:align

#define INSN_SIZE 4

/* Call parameters.  */
#define dstin r0
#define src r1
#define count r2

/* Locals.  */
#define tmp1 r3
#define dst ip
#define tmp2 r8

/* These two macros both work by repeated invocation of the macro
   dispatch_step (not defined here).  That macro performs one "step",
   doing one load instruction and one store instruction to copy one
   "unit".  On entry, TMP1 contains the number of bytes to be copied,
   a multiple of the unit size.  The macro clobbers TMP1 in the
   process of doing a computed jump to the tail containing the
   appropriate number of steps.

   In dispatch_7_dword, dispatch_step is invoked seven times, with an
   argument that is 7 for the first and 1 for the last.  Units are
   double-words (8 bytes).  TMP1 is at most 56.

   In dispatch_15_word, dispatch_step is invoked fifteen times,
   with an argument that is 15 for the first and 1 for the last.
   Units are words (4 bytes).  TMP1 is at most 60.  */
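/* As a worked example of the computed jumps below (the !ARM_ALWAYS_BX
   case): each dispatch_step expands to exactly two 4-byte ARM
   instructions, so a dword step is 8 bytes of code copying 8 bytes of
   data and the byte count in TMP1 doubles as the code offset to skip;
   e.g. TMP1 == 16 lands on "dispatch_step 2", running the final two
   steps.  A word step is also 8 bytes of code but copies only 4 bytes,
   hence the "lsl #1" on that computed jump and the /2 factors in its
   immediate.  */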

#ifndef ARM_ALWAYS_BX
# if ARM_BX_ALIGN_LOG2 != 2
# error case not handled
# endif
	.macro dispatch_7_dword
	rsb tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
	add pc, pc, tmp1
	dispatch_step 7
	dispatch_step 6
	dispatch_step 5
	dispatch_step 4
	dispatch_step 3
	dispatch_step 2
	dispatch_step 1
	.purgem dispatch_step
	.endm

	.macro dispatch_15_word
	rsb tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
	add pc, pc, tmp1, lsl #1
	dispatch_step 15
	dispatch_step 14
	dispatch_step 13
	dispatch_step 12
	dispatch_step 11
	dispatch_step 10
	dispatch_step 9
	dispatch_step 8
	dispatch_step 7
	dispatch_step 6
	dispatch_step 5
	dispatch_step 4
	dispatch_step 3
	dispatch_step 2
	dispatch_step 1
	.purgem dispatch_step
	.endm
#else
# if ARM_BX_ALIGN_LOG2 < 3
# error case not handled
# endif
	.macro dispatch_helper steps, log2_bytes_per_step
	/* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
	   (STEPS << LOG2_BYTES_PER_STEP).
	   So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
	   Then it needs further adjustment to compensate for the
	   distance between the PC value taken below (0f + PC_OFS)
	   and the first step's instructions (1f).  */
	rsb tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
			  + ((1f - PC_OFS - 0f) \
			     >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
	/* Shifting down LOG2_BYTES_PER_STEP gives us the number of
	   steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
	   the (byte) distance to add to the PC.  */
0:	add tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
	bx tmp1
	.p2align ARM_BX_ALIGN_LOG2
1:
	.endm

	.macro dispatch_7_dword
	dispatch_helper 7, 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 7
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 6
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 5
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 4
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 2
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 1
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step
	.endm

	.macro dispatch_15_word
	dispatch_helper 15, 2
	dispatch_step 15
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 14
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 13
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 12
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 11
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 10
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 9
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 8
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 7
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 6
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 5
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 4
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 2
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 1
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step
	.endm

#endif

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l r2 /* Call-clobbered.  */
#define A_h r3 /* Call-clobbered.  */
#define B_l r4
#define B_h r5
#define C_l r6
#define C_h r7
/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
#define D_l r10
#define D_h r11
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines 5

#ifdef USE_VFP
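/* Copy one 64-byte line, stores leading loads: each vstr writes out
   data fetched by an earlier vldr, and the vldr that follows it
   refills the register for a later store (SRC runs 32 bytes ahead of
   DST here; see the set-up in .Lcpy_body_long).  \vreg enters holding
   the leading dword of this line and is refilled from
   prefetch_lines * 64 bytes further on, so it doubles as a software
   prefetch of a line to be copied later.  */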
	.macro cpy_line_vfp vreg, base
	vstr \vreg, [dst, #\base]
	vldr \vreg, [src, #\base]
	vstr d0, [dst, #\base + 8]
	vldr d0, [src, #\base + 8]
	vstr d1, [dst, #\base + 16]
	vldr d1, [src, #\base + 16]
	vstr d2, [dst, #\base + 24]
	vldr d2, [src, #\base + 24]
	vstr \vreg, [dst, #\base + 32]
	vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr d0, [dst, #\base + 40]
	vldr d0, [src, #\base + 40]
	vstr d1, [dst, #\base + 48]
	vldr d1, [src, #\base + 48]
	vstr d2, [dst, #\base + 56]
	vldr d2, [src, #\base + 56]
	.endm

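/* As cpy_line_vfp, but \vreg is not refilled from further ahead, so
   nothing beyond the current line is read; used to drain the last
   prefetched lines at the end of the long-copy loop.  */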
	.macro cpy_tail_vfp vreg, base
	vstr \vreg, [dst, #\base]
	vldr \vreg, [src, #\base]
	vstr d0, [dst, #\base + 8]
	vldr d0, [src, #\base + 8]
	vstr d1, [dst, #\base + 16]
	vldr d1, [src, #\base + 16]
	vstr d2, [dst, #\base + 24]
	vldr d2, [src, #\base + 24]
	vstr \vreg, [dst, #\base + 32]
	vstr d0, [dst, #\base + 40]
	vldr d0, [src, #\base + 40]
	vstr d1, [dst, #\base + 48]
	vldr d1, [src, #\base + 48]
	vstr d2, [dst, #\base + 56]
	vldr d2, [src, #\base + 56]
	.endm
#endif

	.p2align 6
ENTRY(memcpy)

	mov dst, dstin /* Preserve dstin, we need to return it.  */
	cmp count, #64
	bhs .Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  https://sourceware.org/bugzilla/show_bug.cgi?id=15647
	   tracks that bug; it was not fixed as of binutils-2.23.2.  */
	.macro neon_load_d0 reg
	vld1.8 {d0}, [\reg]!
	.endm
	.macro neon_store_d0 reg
	vst1.8 {d0}, [\reg]!
	.endm

	and tmp1, count, #0x38
	.macro dispatch_step i
	neon_load_d0 src
	neon_store_d0 dst
	.endm
	dispatch_7_dword

	tst count, #4
	ldrne tmp1, [src], #4
	strne tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and tmp1, count, #0x3c
	add dst, dst, tmp1
	add src, src, tmp1
	/* Jump directly into the sequence below at the correct offset.  */
	.macro dispatch_step i
	ldr tmp1, [src, #-(\i * 4)]
	str tmp1, [dst, #-(\i * 4)]
	.endm
	dispatch_15_word
#endif

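	/* At most 3 bytes remain.  After the shift below, the carry flag
	   holds bit 1 of COUNT (a trailing halfword to copy) and the N/Z
	   flags reflect bit 0 (a trailing byte).  */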
	lsls count, count, #31
	ldrhcs tmp1, [src], #2
	ldrbne src, [src] /* Src is dead, use as a scratch.  */
	strhcs tmp1, [dst], #2
	strbne src, [dst]
	bx lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str tmp2, [sp, #-FRAME_SIZE]!
	cfi_adjust_cfa_offset (FRAME_SIZE)
	cfi_rel_offset (tmp2, 0)
	cfi_remember_state
	and tmp2, src, #7
	and tmp1, dst, #7
	cmp tmp1, tmp2
	bne .Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32 s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
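	/* TMP2 becomes (8 - (dst & 7)) << 29; the MI, CS and NE
	   conditions below then peel off a word, a halfword and a byte
	   as required, which is exactly the number of bytes needed to
	   bring DST (and hence SRC) to 64-bit alignment.  */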
	lsls tmp2, dst, #29
	beq 1f
	rsbs tmp2, tmp2, #0
	sub count, count, tmp2, lsr #29
	ldrmi tmp1, [src], #4
	strmi tmp1, [dst], #4
	lsls tmp2, tmp2, #2
	ldrhcs tmp1, [src], #2
	ldrbne tmp2, [src], #1
	strhcs tmp1, [dst], #2
	strbne tmp2, [dst], #1

1:
	subs tmp2, count, #64 /* Use tmp2 for count.  */
	blo .Ltail63aligned

	cmp tmp2, #512
	bhs .Lcpy_body_long

.Lcpy_body_medium: /* Count in tmp2.  */
#ifdef USE_VFP
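	/* Copy 64 bytes per iteration, interleaving loads and stores
	   through the two scratch registers d0 and d1.  */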
1:
	vldr d0, [src, #0]
	subs tmp2, tmp2, #64
	vldr d1, [src, #8]
	vstr d0, [dst, #0]
	vldr d0, [src, #16]
	vstr d1, [dst, #8]
	vldr d1, [src, #24]
	vstr d0, [dst, #16]
	vldr d0, [src, #32]
	vstr d1, [dst, #24]
	vldr d1, [src, #40]
	vstr d0, [dst, #32]
	vldr d0, [src, #48]
	vstr d1, [dst, #40]
	vldr d1, [src, #56]
	vstr d0, [dst, #48]
	add src, src, #64
	vstr d1, [dst, #56]
	add dst, dst, #64
	bhs 1b
	tst tmp2, #0x3f
	beq .Ldone

.Ltail63aligned: /* Count in tmp2.  */
	and tmp1, tmp2, #0x38
	add dst, dst, tmp1
	add src, src, tmp1
	.macro dispatch_step i
	vldr d0, [src, #-(\i * 8)]
	vstr d0, [dst, #-(\i * 8)]
	.endm
	dispatch_7_dword
#else
	sub src, src, #8
	sub dst, dst, #8
1:
	ldrd A_l, A_h, [src, #8]
	strd A_l, A_h, [dst, #8]
	ldrd A_l, A_h, [src, #16]
	strd A_l, A_h, [dst, #16]
	ldrd A_l, A_h, [src, #24]
	strd A_l, A_h, [dst, #24]
	ldrd A_l, A_h, [src, #32]
	strd A_l, A_h, [dst, #32]
	ldrd A_l, A_h, [src, #40]
	strd A_l, A_h, [dst, #40]
	ldrd A_l, A_h, [src, #48]
	strd A_l, A_h, [dst, #48]
	ldrd A_l, A_h, [src, #56]
	strd A_l, A_h, [dst, #56]
	ldrd A_l, A_h, [src, #64]!
	strd A_l, A_h, [dst, #64]!
	subs tmp2, tmp2, #64
	bhs 1b
	tst tmp2, #0x3f
	bne 1f
	ldr tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx lr

	cfi_restore_state
	cfi_remember_state
1:
	add src, src, #8
	add dst, dst, #8

.Ltail63aligned: /* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and tmp1, tmp2, #0x38
	add dst, dst, tmp1
	add src, src, tmp1
	.macro dispatch_step i
	ldrd A_l, A_h, [src, #-(\i * 8)]
	strd A_l, A_h, [dst, #-(\i * 8)]
	.endm
	dispatch_7_dword
#endif

	tst tmp2, #4
	ldrne tmp1, [src], #4
	strne tmp1, [dst], #4
	lsls tmp2, tmp2, #31 /* Count (tmp2) now dead.  */
	ldrhcs tmp1, [src], #2
	ldrbne tmp2, [src]
	strhcs tmp1, [dst], #2
	strbne tmp2, [dst]

.Ldone:
	ldr tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx lr

	cfi_restore_state
	cfi_remember_state

.Lcpy_body_long: /* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	vldr d3, [src, #0]
	vldr d4, [src, #64]
	vldr d5, [src, #128]
	vldr d6, [src, #192]
	vldr d7, [src, #256]

	vldr d0, [src, #8]
	vldr d1, [src, #16]
	vldr d2, [src, #24]
	add src, src, #32
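	/* d3-d7 now hold the leading dword of each of the next five
	   64-byte lines and d0-d2 the rest of the first half-line.  SRC
	   is biased 32 bytes ahead of DST so the copy macros' loads run
	   half a line in front of their stores.  */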

	subs tmp2, tmp2, #prefetch_lines * 64 * 2
	blo 2f
1:
	cpy_line_vfp d3, 0
	cpy_line_vfp d4, 64
	cpy_line_vfp d5, 128
	add dst, dst, #3 * 64
	add src, src, #3 * 64
	cpy_line_vfp d6, 0
	cpy_line_vfp d7, 64
	add dst, dst, #2 * 64
	add src, src, #2 * 64
	subs tmp2, tmp2, #prefetch_lines * 64
	bhs 1b

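	/* Fewer than 2 * prefetch_lines lines remain.  Drain the five
	   lines whose leading dwords are already held in d3-d7 without
	   reading any further ahead, re-bias TMP2 for the bytes copied
	   here, and let the medium-copy loop finish off the rest.  */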
2:
	cpy_tail_vfp d3, 0
	cpy_tail_vfp d4, 64
	cpy_tail_vfp d5, 128
	add src, src, #3 * 64
	add dst, dst, #3 * 64
	cpy_tail_vfp d6, 0
	vstr d7, [dst, #64]
	vldr d7, [src, #64]
	vstr d0, [dst, #64 + 8]
	vldr d0, [src, #64 + 8]
	vstr d1, [dst, #64 + 16]
	vldr d1, [src, #64 + 16]
	vstr d2, [dst, #64 + 24]
	vldr d2, [src, #64 + 24]
	vstr d7, [dst, #64 + 32]
	add src, src, #96
	vstr d0, [dst, #64 + 40]
	vstr d1, [dst, #64 + 48]
	vstr d2, [dst, #64 + 56]
	add dst, dst, #128
	add tmp2, tmp2, #prefetch_lines * 64
	b .Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
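	/* A_l/A_h .. D_l/D_h carry 32 bytes at a time: each strd in the
	   loop writes out a pair loaded 32 bytes earlier in the stream
	   while the ldrd that follows refills it, keeping loads ahead of
	   stores.  B, C and D are callee-saved, so they are spilled into
	   the frame reserved on entry before their first use.  */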
	/* Pre-bias src and dst.  */
	sub src, src, #8
	sub dst, dst, #8
	pld [src, #8]
	pld [src, #72]
	subs tmp2, tmp2, #64
	pld [src, #136]
	ldrd A_l, A_h, [src, #8]
	strd B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldrd B_l, B_h, [src, #16]
	strd C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldrd C_l, C_h, [src, #24]
	strd D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	pld [src, #200]
	ldrd D_l, D_h, [src, #32]!
	b 1f
	.p2align 6
2:
	pld [src, #232]
	strd A_l, A_h, [dst, #40]
	ldrd A_l, A_h, [src, #40]
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [src, #48]
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [src, #56]
	strd D_l, D_h, [dst, #64]!
	ldrd D_l, D_h, [src, #64]!
	subs tmp2, tmp2, #64
1:
	strd A_l, A_h, [dst, #8]
	ldrd A_l, A_h, [src, #8]
	strd B_l, B_h, [dst, #16]
	ldrd B_l, B_h, [src, #16]
	strd C_l, C_h, [dst, #24]
	ldrd C_l, C_h, [src, #24]
	strd D_l, D_h, [dst, #32]
	ldrd D_l, D_h, [src, #32]
	bcs 2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd A_l, A_h, [dst, #40]
	add src, src, #40
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
	strd D_l, D_h, [dst, #64]
	ldrd D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
	add dst, dst, #72
	tst tmp2, #0x3f
	bne .Ltail63aligned
	ldr tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx lr
#endif

	cfi_restore_state
	cfi_remember_state

.Lcpy_notaligned:
	pld [src, #0]
	pld [src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls tmp2, dst, #29
	pld [src, #(2 * 64)]
	beq 1f
	rsbs tmp2, tmp2, #0
	sub count, count, tmp2, lsr #29
	ldrmi tmp1, [src], #4
	strmi tmp1, [dst], #4
	lsls tmp2, tmp2, #2
	ldrbne tmp1, [src], #1
	ldrhcs tmp2, [src], #2
	strbne tmp1, [dst], #1
	strhcs tmp2, [dst], #2
1:
	pld [src, #(3 * 64)]
	subs count, count, #64
	ldrlo tmp2, [sp], #FRAME_SIZE
	blo .Ltail63unaligned
	pld [src, #(4 * 64)]

#ifdef USE_NEON
	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  */
	.macro neon_load_multi reglist, basereg
	vld1.8 {\reglist}, [\basereg]!
	.endm
	.macro neon_store_multi reglist, basereg
	vst1.8 {\reglist}, [ALIGN (\basereg, 64)]!
	.endm
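	/* VLD1.8 tolerates the arbitrary source alignment; the stores
	   carry a 64-bit alignment hint (via the ALIGN macro) because
	   DST was brought to 8-byte alignment above.  */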

	neon_load_multi d0-d3, src
	neon_load_multi d4-d7, src
	subs count, count, #64
	blo 2f
1:
	pld [src, #(4 * 64)]
	neon_store_multi d0-d3, dst
	neon_load_multi d0-d3, src
	neon_store_multi d4-d7, dst
	neon_load_multi d4-d7, src
	subs count, count, #64
	bhs 1b
2:
	neon_store_multi d0-d3, dst
	neon_store_multi d4-d7, dst
	ands count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
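	/* Same software-pipelined structure as the aligned long copy,
	   but SRC may be arbitrarily misaligned, so the loads use LDR
	   (permitted by the unaligned-access assumption) instead of
	   LDRD, while the 8-byte-aligned DST side still uses STRD.  */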
	sub src, src, #4
	sub dst, dst, #8
	subs tmp2, count, #64 /* Use tmp2 for count.  */
	ldr A_l, [src, #4]
	ldr A_h, [src, #8]
	strd B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldr B_l, [src, #12]
	ldr B_h, [src, #16]
	strd C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldr C_l, [src, #20]
	ldr C_h, [src, #24]
	strd D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	ldr D_l, [src, #28]
	ldr D_h, [src, #32]!
	b 1f
	.p2align 6
2:
	pld [src, #(5 * 64) - (32 - 4)]
	strd A_l, A_h, [dst, #40]
	ldr A_l, [src, #36]
	ldr A_h, [src, #40]
	strd B_l, B_h, [dst, #48]
	ldr B_l, [src, #44]
	ldr B_h, [src, #48]
	strd C_l, C_h, [dst, #56]
	ldr C_l, [src, #52]
	ldr C_h, [src, #56]
	strd D_l, D_h, [dst, #64]!
	ldr D_l, [src, #60]
	ldr D_h, [src, #64]!
	subs tmp2, tmp2, #64
1:
	strd A_l, A_h, [dst, #8]
	ldr A_l, [src, #4]
	ldr A_h, [src, #8]
	strd B_l, B_h, [dst, #16]
	ldr B_l, [src, #12]
	ldr B_h, [src, #16]
	strd C_l, C_h, [dst, #24]
	ldr C_l, [src, #20]
	ldr C_h, [src, #24]
	strd D_l, D_h, [dst, #32]
	ldr D_l, [src, #28]
	ldr D_h, [src, #32]
	bcs 2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd A_l, A_h, [dst, #40]
	add src, src, #36
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
	strd D_l, D_h, [dst, #64]
	ldrd D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
	add dst, dst, #72
	ands count, tmp2, #0x3f
#endif
	ldr tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bne .Ltail63unaligned
	bx lr

END(memcpy)
libc_hidden_builtin_def (memcpy)