/* A Thunderx2 Optimized memcpy implementation for AARCH64.
   Copyright (C) 2018-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define tmp2 x6
#define tmp3 x7
#define tmp3w w7
#define A_l x6
#define A_lw w6
#define A_h x7
#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l src
#define E_h count
#define F_l srcend
#define F_h dst
#define G_l count
#define G_h dst
#define tmp1 x14

#define A_q q0
#define B_q q1
#define C_q q2
#define D_q q3
#define E_q q4
#define F_q q5
#define G_q q6
#define H_q q7
#define I_q q16
#define J_q q17

#define A_v v0
#define B_v v1
#define C_v v2
#define D_v v3
#define E_v v4
#define F_v v5
#define G_v v6
#define H_v v7
#define I_v v16
#define J_v v17

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#if IS_IN (libc)

#undef MEMCPY
#define MEMCPY __memcpy_thunderx2
#undef MEMMOVE
#define MEMMOVE __memmove_thunderx2


/* Overlapping large forward memmoves use a loop that copies backwards.
   Otherwise memcpy is used. Small moves branch to memcopy16 directly.
   The longer memcpy cases fall through to the memcpy head.
*/
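/* For illustration only (not part of the build): the memmove dispatch
   below is roughly the following C, assuming AArch64-style 64-bit
   unsigned wrap-around arithmetic; copy_0_to_16, copy_backwards and
   big_memcpy are hypothetical helpers standing in for the labels named
   in the comments:

       #include <stddef.h>
       #include <stdint.h>

       void *copy_0_to_16 (void *, const void *, size_t);   // hypothetical
       void *copy_backwards (void *, const void *, size_t); // hypothetical
       void *big_memcpy (void *, const void *, size_t);     // hypothetical

       void *sketch_memmove (void *dstin, const void *src, size_t count)
       {
         uint64_t diff = (uint64_t) dstin - (uint64_t) src;

         if (count <= 16)
           return copy_0_to_16 (dstin, src, count);    // L(memcopy16)
         if (count > 96 && diff < count)               // forward overlap
           return copy_backwards (dstin, src, count);  // L(move_long)
         return big_memcpy (dstin, src, count);        // memcpy head
       }
 */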

ENTRY_ALIGN (MEMMOVE, 6)

        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        add srcend, src, count
        cmp count, 16
        b.ls L(memcopy16)
        sub tmp1, dstin, src
        cmp count, 96
        ccmp tmp1, count, 2, hi
        b.lo L(move_long)

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)


/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large
   copies of more than 96 bytes.  Large copies align the destination
   and use a load-and-merge approach when the src and dst addresses
   are not equally aligned, so that the actual loads and stores are
   always aligned.  Large copies use loops processing 64 bytes per
   iteration for the unaligned case and 128 bytes per iteration for
   the aligned one.
*/
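/* The medium cases below load the first and the last 16 bytes up front
   (A_q and E_q) and let the 16-byte stores overlap, so no byte-wise
   tail handling is needed.  A C sketch of the smallest such case, for
   illustration only:

       #include <string.h>

       // assumes 17 <= n <= 32; the two 16-byte stores may overlap
       static void copy_17_to_32 (char *dst, const char *src, size_t n)
       {
         unsigned char head[16], tail[16];
         memcpy (head, src, 16);           // ldr A_q, [src]
         memcpy (tail, src + n - 16, 16);  // ldr E_q, [srcend, -16]
         memcpy (dst, head, 16);           // str A_q, [dstin]
         memcpy (dst + n - 16, tail, 16);  // str E_q, [dstend, -16]
       }
 */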

#define MEMCPY_PREFETCH_LDR 640

        .p2align 4
ENTRY (MEMCPY)

        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        add srcend, src, count
        cmp count, 16
        b.ls L(memcopy16)
        ldr A_q, [src], #16
        add dstend, dstin, count
        and tmp1, src, 15
        cmp count, 96
        b.hi L(memcopy_long)

        /* Medium copies: 17..96 bytes. */
        ldr E_q, [srcend, -16]
        cmp count, 64
        b.gt L(memcpy_copy96)
        cmp count, 48
        b.le L(bytes_17_to_48)
        /* 49..64 bytes */
        ldp B_q, C_q, [src]
        str E_q, [dstend, -16]
        stp A_q, B_q, [dstin]
        str C_q, [dstin, 32]
        ret

L(bytes_17_to_48):
        /* 17..48 bytes.  */
        cmp count, 32
        b.gt L(bytes_32_to_48)
        /* 17..32 bytes.  */
        str A_q, [dstin]
        str E_q, [dstend, -16]
        ret

L(bytes_32_to_48):
        /* 33..48 bytes.  */
        ldr B_q, [src]
        str A_q, [dstin]
        str E_q, [dstend, -16]
        str B_q, [dstin, 16]
        ret

        .p2align 4
        /* Small copies: 0..16 bytes. */
L(memcopy16):
        cmp count, 8
        b.lo L(bytes_0_to_8)
        ldr A_l, [src]
        ldr A_h, [srcend, -8]
        add dstend, dstin, count
        str A_l, [dstin]
        str A_h, [dstend, -8]
        ret
        .p2align 4

L(bytes_0_to_8):
        tbz count, 2, L(bytes_0_to_3)
        ldr A_lw, [src]
        ldr A_hw, [srcend, -4]
        add dstend, dstin, count
        str A_lw, [dstin]
        str A_hw, [dstend, -4]
        ret

        /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
           byte 3 times if count==1, or the 2nd byte twice if count==2. */
L(bytes_0_to_3):
        cbz count, 1f
        lsr tmp1, count, 1
        ldrb A_lw, [src]
        ldrb A_hw, [srcend, -1]
        add dstend, dstin, count
        ldrb B_lw, [src, tmp1]
        strb B_lw, [dstin, tmp1]
        strb A_hw, [dstend, -1]
        strb A_lw, [dstin]
1:
        ret
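        /* A C sketch of the branchless 0..3 byte copy above, for
           illustration only: three byte stores (middle, last, first)
           are always performed and simply land on the same locations
           when count is 1 or 2:

               #include <stddef.h>

               static void copy_0_to_3 (unsigned char *dst,
                                        const unsigned char *src, size_t n)
               {
                 if (n == 0)
                   return;
                 size_t mid = n >> 1;           // 0 for n==1, 1 for n==2 or 3
                 unsigned char a = src[0];      // ldrb A_lw, [src]
                 unsigned char b = src[n - 1];  // ldrb A_hw, [srcend, -1]
                 unsigned char c = src[mid];    // ldrb B_lw, [src, tmp1]
                 dst[mid] = c;
                 dst[n - 1] = b;
                 dst[0] = a;
               }
         */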

        .p2align 4

L(memcpy_copy96):
        /* Copying 65..96 bytes.  A_q (first 16 bytes) and
           E_q (last 16 bytes) are already loaded.  The size
           is large enough to benefit from aligned loads.  */
        bic src, src, 15
        ldp B_q, C_q, [src]
        /* Loaded 64 bytes; the second 16-byte chunk can
           overlap the first chunk by tmp1 bytes.
           Stored 16 bytes.  */
        sub dst, dstin, tmp1
        add count, count, tmp1
        /* The range of count, [65..96], becomes [65..111]
           after tmp1 [0..15] is added to it;
           count is now <bytes-left-to-load> + 48.  */
        cmp count, 80
        b.gt L(copy96_medium)
        ldr D_q, [src, 32]
        stp B_q, C_q, [dst, 16]
        str D_q, [dst, 48]
        str A_q, [dstin]
        str E_q, [dstend, -16]
        ret

        .p2align 4
L(copy96_medium):
        ldp D_q, G_q, [src, 32]
        cmp count, 96
        b.gt L(copy96_large)
        stp B_q, C_q, [dst, 16]
        stp D_q, G_q, [dst, 48]
        str A_q, [dstin]
        str E_q, [dstend, -16]
        ret

L(copy96_large):
        ldr F_q, [src, 64]
        str B_q, [dst, 16]
        stp C_q, D_q, [dst, 32]
        stp G_q, F_q, [dst, 64]
        str A_q, [dstin]
        str E_q, [dstend, -16]
        ret
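        /* Illustration only: the 65..96 byte paths above copy the
           unaligned first and last 16 bytes as-is and re-read the body
           with 16-byte aligned loads whose destination is shifted back
           by the source misalignment (tmp1); the overlapping head and
           tail stores cover the edges.  A rough C equivalent, with a
           hypothetical helper name:

               #include <string.h>
               #include <stdint.h>

               static void copy_65_to_96 (char *dst, const char *src, size_t n)
               {
                 char first[16], last[16];
                 memcpy (first, src, 16);                 // A_q
                 memcpy (last, src + n - 16, 16);         // E_q

                 const char *sbody =
                   (const char *) ((uintptr_t) (src + 16) & ~(uintptr_t) 15);
                 char *dbody = dst + (sbody - src);       // dst = dstin - tmp1 + 16
                 size_t body = (src + n - 16) - sbody;    // may overlap 'last'

                 memcpy (dbody, sbody, body);             // aligned ldp/stp in asm
                 memcpy (dst, first, 16);
                 memcpy (dst + n - 16, last, 16);
               }
         */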

        .p2align 4
L(memcopy_long):
        bic src, src, 15
        ldp B_q, C_q, [src], #32
        sub dst, dstin, tmp1
        add count, count, tmp1
        add dst, dst, 16
        and tmp1, dst, 15
        ldp D_q, E_q, [src], #32
        str A_q, [dstin]

        /* Already loaded 64+16 bytes.  Check if at
           least 64 more bytes are left.  */
        subs count, count, 64+64+16
        b.lt L(loop128_exit0)
        cmp count, MEMCPY_PREFETCH_LDR + 64 + 32
        b.lt L(loop128)
        cbnz tmp1, L(dst_unaligned)
        sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32

        .p2align 4

L(loop128_prefetch):
        prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
        ldp F_q, G_q, [src], #32
        stp B_q, C_q, [dst], #32
        ldp H_q, I_q, [src], #32
        prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
        ldp B_q, C_q, [src], #32
        stp D_q, E_q, [dst], #32
        ldp D_q, E_q, [src], #32
        stp F_q, G_q, [dst], #32
        stp H_q, I_q, [dst], #32
        subs count, count, 128
        b.ge L(loop128_prefetch)

        add count, count, MEMCPY_PREFETCH_LDR + 64 + 32
        .p2align 4
L(loop128):
        ldp F_q, G_q, [src], #32
        ldp H_q, I_q, [src], #32
        stp B_q, C_q, [dst], #32
        stp D_q, E_q, [dst], #32
        subs count, count, 64
        b.lt L(loop128_exit1)
        ldp B_q, C_q, [src], #32
        ldp D_q, E_q, [src], #32
        stp F_q, G_q, [dst], #32
        stp H_q, I_q, [dst], #32
        subs count, count, 64
        b.ge L(loop128)
L(loop128_exit0):
        ldp F_q, G_q, [srcend, -64]
        ldp H_q, I_q, [srcend, -32]
        stp B_q, C_q, [dst], #32
        stp D_q, E_q, [dst]
        stp F_q, G_q, [dstend, -64]
        stp H_q, I_q, [dstend, -32]
        ret
L(loop128_exit1):
        ldp B_q, C_q, [srcend, -64]
        ldp D_q, E_q, [srcend, -32]
        stp F_q, G_q, [dst], #32
        stp H_q, I_q, [dst]
        stp B_q, C_q, [dstend, -64]
        stp D_q, E_q, [dstend, -32]
        ret
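        /* The two loops above are software pipelined: each iteration
           stores the 64 bytes loaded on the previous round while
           already loading the next 64, and L(loop128_prefetch)
           additionally prefetches MEMCPY_PREFETCH_LDR (640) bytes
           ahead with PRFM PLDL1STRM.  A structural C sketch, for
           illustration only (the real code also handles the unaligned
           head and tail):

               #include <string.h>

               // assumes n is a multiple of 64 and n >= 64
               static void copy_64byte_blocks (char *dst, const char *src,
                                               size_t n)
               {
                 char cur[64], nxt[64];
                 memcpy (cur, src, 64); src += 64; n -= 64;  // prologue load
                 while (n >= 64)
                   {
                     __builtin_prefetch (src + 640, 0, 0);   // GCC/Clang builtin,
                                                             //   ~ prfm pldl1strm
                     memcpy (nxt, src, 64); src += 64;       // load next block ...
                     memcpy (dst, cur, 64); dst += 64;       // ... store previous one
                     memcpy (cur, nxt, 64); n -= 64;
                   }
                 memcpy (dst, cur, 64);                      // epilogue store
               }
         */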

L(dst_unaligned_tail):
        ldp C_q, D_q, [srcend, -64]
        ldp E_q, F_q, [srcend, -32]
        stp A_q, B_q, [dst], #32
        stp H_q, I_q, [dst], #16
        str G_q, [dst, tmp1]
        stp C_q, D_q, [dstend, -64]
        stp E_q, F_q, [dstend, -32]
        ret

L(dst_unaligned):
        /* For the unaligned store case the code loads two
           aligned chunks and then merges them using the ext
           instruction.  This can be up to 30% faster than
           the simple unaligned store access.

           Current state: tmp1 = dst % 16; C_q, D_q, E_q
           contain data yet to be stored.  src and dst point
           to the next data to be processed.  A_q, B_q contain
           data already stored before; count = bytes left to
           be loaded, decremented by 64.

           Control is passed here if at least 64 bytes are
           left to be loaded.  The code does two aligned loads
           and then extracts tmp1 bytes from the first register
           and (16-tmp1) bytes from the next register, forming
           the value for the aligned store.

           As the ext instruction can only have its index encoded
           as an immediate, 15 code chunks process each possible
           index value.  A computed goto is used to reach the
           required chunk.  */
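        /* A C sketch of the merge performed by each EXT_CHUNK below,
           for illustration only: with the destination misalignment
           shft (1..15), every aligned 16-byte store value is the top
           shft bytes of one aligned source chunk followed by the low
           16-shft bytes of the next chunk, which is what ext with
           immediate 16-shft produces:

               #include <string.h>

               static void merge16 (unsigned char out[16],
                                    const unsigned char lo[16],
                                    const unsigned char hi[16],
                                    unsigned shft)        // 1..15
               {
                 memcpy (out, lo + 16 - shft, shft);      // tail of previous chunk
                 memcpy (out + shft, hi, 16 - shft);      // head of next chunk
               }
         */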

        /* Store the next 32 bytes (B_q, C_q) to dst and align dst
           for further operations; some bytes will be stored at
           this address once more.  */

        ldp F_q, G_q, [src], #32
        stp B_q, C_q, [dst], #32
        bic dst, dst, 15
        sub count, count, 32
        adrp tmp2, L(ext_table)
        add tmp2, tmp2, :lo12:L(ext_table)
        add tmp2, tmp2, tmp1, LSL #2
        ldr tmp3w, [tmp2]
        add tmp2, tmp2, tmp3w, SXTW
        br tmp2
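        /* The branch above is the computed goto mentioned in the
           comment: each 32-bit entry of L(ext_table) is
           ".word L(ext_size_N) - .", i.e. the signed distance from the
           entry itself to its handler.  A C sketch of the decode, for
           illustration only (names are hypothetical):

               #include <stdint.h>

               typedef void (*handler_fn) (void);

               static handler_fn decode_entry (const int32_t *table,
                                               unsigned shft)
               {
                 const int32_t *entry = table + shft;          // add tmp2, tmp2, tmp1, LSL #2
                 intptr_t target = (intptr_t) entry + *entry;  // add tmp2, tmp2, tmp3w, SXTW
                 return (handler_fn) target;                   // br tmp2
               }
         */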

.p2align 4
        /* To make the loop in each chunk 16-byte aligned.  */
        nop
#define EXT_CHUNK(shft) \
L(ext_size_ ## shft):;\
        ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\
        ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\
        ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\
1:;\
        stp A_q, B_q, [dst], #32;\
        prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
        ldp C_q, D_q, [src], #32;\
        ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
        stp H_q, I_q, [dst], #32;\
        ext A_v.16b, G_v.16b, C_v.16b, 16-shft;\
        ext B_v.16b, C_v.16b, D_v.16b, 16-shft;\
        ldp F_q, G_q, [src], #32;\
        ext H_v.16b, D_v.16b, F_v.16b, 16-shft;\
        subs count, count, 64;\
        b.ge 1b;\
2:;\
        ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
        b L(dst_unaligned_tail);

EXT_CHUNK(1)
EXT_CHUNK(2)
EXT_CHUNK(3)
EXT_CHUNK(4)
EXT_CHUNK(5)
EXT_CHUNK(6)
EXT_CHUNK(7)
EXT_CHUNK(8)
EXT_CHUNK(9)
EXT_CHUNK(10)
EXT_CHUNK(11)
EXT_CHUNK(12)
EXT_CHUNK(13)
EXT_CHUNK(14)
EXT_CHUNK(15)

L(move_long):
        .p2align 4
1:
        cbz tmp1, 3f

        add srcend, src, count
        add dstend, dstin, count

        and tmp1, srcend, 15
        ldr D_q, [srcend, -16]
        sub srcend, srcend, tmp1
        sub count, count, tmp1
        ldp A_q, B_q, [srcend, -32]
        str D_q, [dstend, -16]
        ldp C_q, D_q, [srcend, -64]!
        sub dstend, dstend, tmp1
        subs count, count, 128
        b.ls 2f

        .p2align 4
1:
        subs count, count, 64
        stp A_q, B_q, [dstend, -32]
        ldp A_q, B_q, [srcend, -32]
        stp C_q, D_q, [dstend, -64]!
        ldp C_q, D_q, [srcend, -64]!
        b.hi 1b

        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the start even if
           there is just 1 byte left.  */
2:
        ldp E_q, F_q, [src, 32]
        ldp G_q, H_q, [src]
        stp A_q, B_q, [dstend, -32]
        stp C_q, D_q, [dstend, -64]
        stp E_q, F_q, [dstin, 32]
        stp G_q, H_q, [dstin]
3: ret
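        /* A structural C sketch of the backward move above, for
           illustration only (the real code first copies the unaligned
           last 16 bytes and aligns srcend; that is omitted here):

               #include <string.h>

               // assumes n > 96 and dst overlaps src with dst > src
               static void move_backwards (char *dst, const char *src, size_t n)
               {
                 char blk[64], head[64];
                 size_t pos = n;
                 while (pos > 128)                  // 64-byte blocks, top down:
                   {                                // each store only clobbers
                     pos -= 64;                     // source bytes above anything
                     memcpy (blk, src + pos, 64);   // still to be read
                     memcpy (dst + pos, blk, 64);
                   }
                 memcpy (blk, src + pos - 64, 64);  // last full block above the head
                 memcpy (head, src, 64);            // read the head before the
                 memcpy (dst + pos - 64, blk, 64);  //   store below can clobber it
                 memcpy (dst, head, 64);            // head may overlap that block:
               }                                    // always 64 bytes from the start
         */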


END (MEMCPY)
        .section .rodata
        .p2align 4

L(ext_table):
        /* The first entry is for the alignment of 0 and is never
           actually used (could be any value).  */
        .word 0
        .word L(ext_size_1) -.
        .word L(ext_size_2) -.
        .word L(ext_size_3) -.
        .word L(ext_size_4) -.
        .word L(ext_size_5) -.
        .word L(ext_size_6) -.
        .word L(ext_size_7) -.
        .word L(ext_size_8) -.
        .word L(ext_size_9) -.
        .word L(ext_size_10) -.
        .word L(ext_size_11) -.
        .word L(ext_size_12) -.
        .word L(ext_size_13) -.
        .word L(ext_size_14) -.
        .word L(ext_size_15) -.

libc_hidden_builtin_def (MEMCPY)
#endif

