/* A ThunderX2 optimized memcpy implementation for AArch64.
2 | Copyright (C) 2018-2022 Free Software Foundation, Inc. |
3 | |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include <sysdep.h> |
21 | |
22 | /* Assumptions: |
23 | * |
24 | * ARMv8-a, AArch64, unaligned accesses. |
25 | * |
26 | */ |
27 | |
28 | #define dstin x0 |
29 | #define src x1 |
30 | #define count x2 |
31 | #define dst x3 |
32 | #define srcend x4 |
33 | #define dstend x5 |
34 | #define tmp2 x6 |
35 | #define tmp3 x7 |
36 | #define tmp3w w7 |
37 | #define A_l x6 |
38 | #define A_lw w6 |
39 | #define A_h x7 |
40 | #define A_hw w7 |
41 | #define B_l x8 |
42 | #define B_lw w8 |
43 | #define B_h x9 |
44 | #define C_l x10 |
45 | #define C_h x11 |
46 | #define D_l x12 |
47 | #define D_h x13 |
48 | #define E_l src |
49 | #define E_h count |
50 | #define F_l srcend |
51 | #define F_h dst |
52 | #define G_l count |
53 | #define G_h dst |
54 | #define tmp1 x14 |
55 | |
56 | #define A_q q0 |
57 | #define B_q q1 |
58 | #define C_q q2 |
59 | #define D_q q3 |
60 | #define E_q q4 |
61 | #define F_q q5 |
62 | #define G_q q6 |
63 | #define H_q q7 |
64 | #define I_q q16 |
65 | #define J_q q17 |
66 | |
67 | #define A_v v0 |
68 | #define B_v v1 |
69 | #define C_v v2 |
70 | #define D_v v3 |
71 | #define E_v v4 |
72 | #define F_v v5 |
73 | #define G_v v6 |
74 | #define H_v v7 |
75 | #define I_v v16 |
76 | #define J_v v17 |
77 | |
78 | #ifndef MEMMOVE |
79 | # define MEMMOVE memmove |
80 | #endif |
81 | #ifndef MEMCPY |
82 | # define MEMCPY memcpy |
83 | #endif |
84 | |
85 | #if IS_IN (libc) |
86 | |
87 | #undef MEMCPY |
88 | #define MEMCPY __memcpy_thunderx2 |
89 | #undef MEMMOVE |
90 | #define MEMMOVE __memmove_thunderx2 |
91 | |
92 | |
/* Overlapping large forward memmoves use a loop that copies backwards.
   Otherwise memcpy is used: small moves branch to memcopy16 directly,
   and the longer cases fall through to the memcpy head.  Copies of at
   most 96 bytes load all of the data before storing any of it, so
   they are safe even for overlapping buffers.
*/
97 | |
98 | ENTRY_ALIGN (MEMMOVE, 6) |
99 | |
100 | PTR_ARG (0) |
101 | PTR_ARG (1) |
102 | SIZE_ARG (2) |
103 | |
104 | add srcend, src, count |
105 | cmp count, 16 |
106 | b.ls L(memcopy16) |
107 | sub tmp1, dstin, src |
108 | cmp count, 96 |
109 | ccmp tmp1, count, 2, hi |
110 | b.lo L(move_long) |
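
	/* The dispatch above is roughly equivalent to this C sketch
	   (illustrative only; the ccmp folds both tests into a single
	   conditional branch):

	     if (count <= 16)
	       goto memcopy16;		// small moves
	     if (count > 96
		 && (uintptr_t) (dstin - src) < (uintptr_t) count)
	       goto move_long;		// forward overlap: copy backwards
	     // otherwise fall through to the memcpy head
	   */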
111 | |
112 | END (MEMMOVE) |
113 | libc_hidden_builtin_def (MEMMOVE) |
114 | |
115 | |
/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes.  Large copies align the destination
   and, when the src and dst addresses are not equally aligned, use a
   load-and-merge approach so that the actual loads and stores are
   always aligned.  Large copies use loops processing 64 bytes per
   iteration for the unaligned case and 128 bytes per iteration for
   the aligned one.
*/
124 | |
125 | #define MEMCPY_PREFETCH_LDR 640 |
126 | |
127 | .p2align 4 |
128 | ENTRY (MEMCPY) |
129 | |
130 | PTR_ARG (0) |
131 | PTR_ARG (1) |
132 | SIZE_ARG (2) |
133 | |
134 | add srcend, src, count |
135 | cmp count, 16 |
136 | b.ls L(memcopy16) |
137 | ldr A_q, [src], #16 |
138 | add dstend, dstin, count |
139 | and tmp1, src, 15 |
140 | cmp count, 96 |
141 | b.hi L(memcopy_long) |
142 | |
143 | /* Medium copies: 17..96 bytes. */ |
144 | ldr E_q, [srcend, -16] |
145 | cmp count, 64 |
146 | b.gt L(memcpy_copy96) |
147 | cmp count, 48 |
148 | b.le L(bytes_17_to_48) |
149 | /* 49..64 bytes */ |
150 | ldp B_q, C_q, [src] |
151 | str E_q, [dstend, -16] |
152 | stp A_q, B_q, [dstin] |
153 | str C_q, [dstin, 32] |
154 | ret |
155 | |
156 | L(bytes_17_to_48): |
	/* 17..48 bytes.  */
158 | cmp count, 32 |
159 | b.gt L(bytes_32_to_48) |
	/* 17..32 bytes.  */
161 | str A_q, [dstin] |
162 | str E_q, [dstend, -16] |
163 | ret |
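
	/* Note that the two 16-byte stores above overlap by
	   (32 - count) bytes; storing the tail from dstend - 16
	   makes the copy length-exact without extra branching.  */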
164 | |
165 | L(bytes_32_to_48): |
166 | /* 32..48 */ |
167 | ldr B_q, [src] |
168 | str A_q, [dstin] |
169 | str E_q, [dstend, -16] |
170 | str B_q, [dstin, 16] |
171 | ret |
172 | |
173 | .p2align 4 |
174 | /* Small copies: 0..16 bytes. */ |
175 | L(memcopy16): |
176 | cmp count, 8 |
177 | b.lo L(bytes_0_to_8) |
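	/* 8..16 bytes: copy two 8-byte words, one from the start
	   and one ending exactly at the end; they overlap whenever
	   count < 16.  */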
178 | ldr A_l, [src] |
179 | ldr A_h, [srcend, -8] |
180 | add dstend, dstin, count |
181 | str A_l, [dstin] |
182 | str A_h, [dstend, -8] |
183 | ret |
184 | .p2align 4 |
185 | |
186 | L(bytes_0_to_8): |
187 | tbz count, 2, L(bytes_0_to_3) |
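	/* 4..7 bytes: copy two 4-byte words, one from the start
	   and one ending exactly at the end.  */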
188 | ldr A_lw, [src] |
189 | ldr A_hw, [srcend, -4] |
190 | add dstend, dstin, count |
191 | str A_lw, [dstin] |
192 | str A_hw, [dstend, -4] |
193 | ret |
194 | |
195 | /* Copy 0..3 bytes. Use a branchless sequence that copies the same |
196 | byte 3 times if count==1, or the 2nd byte twice if count==2. */ |
197 | L(bytes_0_to_3): |
198 | cbz count, 1f |
199 | lsr tmp1, count, 1 |
200 | ldrb A_lw, [src] |
201 | ldrb A_hw, [srcend, -1] |
202 | add dstend, dstin, count |
203 | ldrb B_lw, [src, tmp1] |
204 | strb B_lw, [dstin, tmp1] |
205 | strb A_hw, [dstend, -1] |
206 | strb A_lw, [dstin] |
207 | 1: |
208 | ret |
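
	/* The sequence above, in rough C for count in 1..3
	   (illustrative only):

	     dst[0]         = src[0];
	     dst[count / 2] = src[count / 2];
	     dst[count - 1] = src[count - 1];

	   For count == 1 all three stores hit the same byte; for
	   count == 2 the middle store duplicates the second byte.  */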
209 | |
210 | .p2align 4 |
211 | |
212 | L(memcpy_copy96): |
	/* Copying 65..96 bytes.  A_q (first 16 bytes) and
	   E_q (last 16 bytes) are already loaded.  The size
	   is large enough to benefit from aligned loads.  */
216 | bic src, src, 15 |
217 | ldp B_q, C_q, [src] |
	/* 64 bytes have been loaded; the second 16-byte chunk
	   may overlap the first chunk by tmp1 bytes.  The first
	   16 bytes are covered by the pending store of A_q.  */
221 | sub dst, dstin, tmp1 |
222 | add count, count, tmp1 |
	/* count, which was in [65..96], is now in [65..111]
	   after tmp1 [0..15] has been added to it;
	   count is now <bytes-left-to-load> + 48.  */
226 | cmp count, 80 |
227 | b.gt L(copy96_medium) |
228 | ldr D_q, [src, 32] |
229 | stp B_q, C_q, [dst, 16] |
230 | str D_q, [dst, 48] |
231 | str A_q, [dstin] |
232 | str E_q, [dstend, -16] |
233 | ret |
234 | |
235 | .p2align 4 |
236 | L(copy96_medium): |
237 | ldp D_q, G_q, [src, 32] |
238 | cmp count, 96 |
239 | b.gt L(copy96_large) |
240 | stp B_q, C_q, [dst, 16] |
241 | stp D_q, G_q, [dst, 48] |
242 | str A_q, [dstin] |
243 | str E_q, [dstend, -16] |
244 | ret |
245 | |
246 | L(copy96_large): |
247 | ldr F_q, [src, 64] |
248 | str B_q, [dst, 16] |
249 | stp C_q, D_q, [dst, 32] |
250 | stp G_q, F_q, [dst, 64] |
251 | str A_q, [dstin] |
252 | str E_q, [dstend, -16] |
253 | ret |
254 | |
255 | .p2align 4 |
256 | L(memcopy_long): |
257 | bic src, src, 15 |
258 | ldp B_q, C_q, [src], #32 |
259 | sub dst, dstin, tmp1 |
260 | add count, count, tmp1 |
261 | add dst, dst, 16 |
262 | and tmp1, dst, 15 |
263 | ldp D_q, E_q, [src], #32 |
264 | str A_q, [dstin] |
265 | |
	/* 64+16 bytes have already been loaded.  Check whether
	   at least 64 more bytes are left.  */
268 | subs count, count, 64+64+16 |
269 | b.lt L(loop128_exit0) |
270 | cmp count, MEMCPY_PREFETCH_LDR + 64 + 32 |
271 | b.lt L(loop128) |
272 | cbnz tmp1, L(dst_unaligned) |
273 | sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32 |
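
	/* The loop below is software pipelined: B_q..E_q hold the
	   64 bytes loaded on the previous iteration while the next
	   128 bytes are loaded, and each half of an iteration
	   issues a PLDL1STRM prefetch MEMCPY_PREFETCH_LDR bytes
	   ahead of src.  */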
274 | |
275 | .p2align 4 |
276 | |
277 | L(loop128_prefetch): |
278 | prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] |
279 | ldp F_q, G_q, [src], #32 |
280 | stp B_q, C_q, [dst], #32 |
281 | ldp H_q, I_q, [src], #32 |
282 | prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] |
283 | ldp B_q, C_q, [src], #32 |
284 | stp D_q, E_q, [dst], #32 |
285 | ldp D_q, E_q, [src], #32 |
286 | stp F_q, G_q, [dst], #32 |
287 | stp H_q, I_q, [dst], #32 |
288 | subs count, count, 128 |
289 | b.ge L(loop128_prefetch) |
290 | |
291 | add count, count, MEMCPY_PREFETCH_LDR + 64 + 32 |
292 | .p2align 4 |
293 | L(loop128): |
294 | ldp F_q, G_q, [src], #32 |
295 | ldp H_q, I_q, [src], #32 |
296 | stp B_q, C_q, [dst], #32 |
297 | stp D_q, E_q, [dst], #32 |
298 | subs count, count, 64 |
299 | b.lt L(loop128_exit1) |
300 | ldp B_q, C_q, [src], #32 |
301 | ldp D_q, E_q, [src], #32 |
302 | stp F_q, G_q, [dst], #32 |
303 | stp H_q, I_q, [dst], #32 |
304 | subs count, count, 64 |
305 | b.ge L(loop128) |
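
	/* Loop exits: fewer than 64 bytes remain to be loaded.  The
	   final 64 bytes are reloaded relative to srcend and stored
	   relative to dstend; these stores may overlap bytes already
	   written, which is harmless for a forward copy.  */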
306 | L(loop128_exit0): |
307 | ldp F_q, G_q, [srcend, -64] |
308 | ldp H_q, I_q, [srcend, -32] |
309 | stp B_q, C_q, [dst], #32 |
310 | stp D_q, E_q, [dst] |
311 | stp F_q, G_q, [dstend, -64] |
312 | stp H_q, I_q, [dstend, -32] |
313 | ret |
314 | L(loop128_exit1): |
315 | ldp B_q, C_q, [srcend, -64] |
316 | ldp D_q, E_q, [srcend, -32] |
317 | stp F_q, G_q, [dst], #32 |
318 | stp H_q, I_q, [dst] |
319 | stp B_q, C_q, [dstend, -64] |
320 | stp D_q, E_q, [dstend, -32] |
321 | ret |
322 | |
323 | L(dst_unaligned_tail): |
324 | ldp C_q, D_q, [srcend, -64] |
325 | ldp E_q, F_q, [srcend, -32] |
326 | stp A_q, B_q, [dst], #32 |
327 | stp H_q, I_q, [dst], #16 |
328 | str G_q, [dst, tmp1] |
329 | stp C_q, D_q, [dstend, -64] |
330 | stp E_q, F_q, [dstend, -32] |
331 | ret |
332 | |
333 | L(dst_unaligned): |
	/* For the unaligned store case the code loads two
	   aligned chunks and then merges them using the ext
	   instruction.  This can be up to 30% faster than
	   a simple unaligned store access.

	   Current state: tmp1 = dst % 16; C_q, D_q, E_q
	   contain data yet to be stored; src and dst point
	   to the next data to be processed; A_q, B_q contain
	   data already stored earlier; count = bytes left to
	   be loaded, decremented by 64.

	   Control is passed here if at least 64 bytes are left
	   to be loaded.  The code does two aligned loads and then
	   extracts the top tmp1 bytes from the first register and
	   the low (16-tmp1) bytes from the next register, forming
	   the value for the aligned store.

	   As the ext instruction can only have its index encoded
	   as an immediate, 15 code chunks handle each possible
	   index value, and a computed goto is used to reach the
	   required chunk.  */
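
	/* For reference: ext Vd.16b, Vn.16b, Vm.16b, #i forms Vd
	   from bytes [i..15] of Vn followed by bytes [0..i-1] of
	   Vm.  With shft = dst % 16 and i = 16 - shft, each merge
	   below is, in illustrative C:

	     for (j = 0; j < 16; j++)
	       out[j] = (j < shft) ? lo[16 - shft + j] : hi[j - shft];

	   where lo and hi are consecutive aligned source chunks and
	   out is the merged value for one aligned store.  */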
355 | |
	/* Store 32 bytes to dst and align dst down for further
	   operations; the last tmp1 bytes will be stored at this
	   address once more.  */
359 | |
360 | ldp F_q, G_q, [src], #32 |
361 | stp B_q, C_q, [dst], #32 |
362 | bic dst, dst, 15 |
363 | sub count, count, 32 |
364 | adrp tmp2, L(ext_table) |
365 | add tmp2, tmp2, :lo12:L(ext_table) |
366 | add tmp2, tmp2, tmp1, LSL #2 |
367 | ldr tmp3w, [tmp2] |
368 | add tmp2, tmp2, tmp3w, SXTW |
369 | br tmp2 |
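
	/* Each 32-bit entry of L(ext_table) holds the offset of its
	   L(ext_size_N) chunk relative to the entry's own address;
	   the sign-extended offset loaded above is added back to the
	   entry address to form a position-independent branch
	   target.  */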
370 | |
371 | .p2align 4 |
	/* Make the loop in each chunk 16-byte aligned.  */
373 | nop |
374 | #define EXT_CHUNK(shft) \ |
375 | L(ext_size_ ## shft):;\ |
376 | ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\ |
377 | ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\ |
378 | ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\ |
379 | 1:;\ |
380 | stp A_q, B_q, [dst], #32;\ |
381 | prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\ |
382 | ldp C_q, D_q, [src], #32;\ |
383 | ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\ |
384 | stp H_q, I_q, [dst], #32;\ |
385 | ext A_v.16b, G_v.16b, C_v.16b, 16-shft;\ |
386 | ext B_v.16b, C_v.16b, D_v.16b, 16-shft;\ |
387 | ldp F_q, G_q, [src], #32;\ |
388 | ext H_v.16b, D_v.16b, F_v.16b, 16-shft;\ |
389 | subs count, count, 64;\ |
390 | b.ge 1b;\ |
391 | 2:;\ |
392 | ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\ |
393 | b L(dst_unaligned_tail); |
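
/* Each loop iteration of a chunk loads 64 aligned bytes into C_q,
   D_q, F_q and G_q and stores 64 merged bytes; the aligned data
   rotates through G_q -> C_q -> D_q -> F_q -> G_q, while E_q is
   consumed only on entry to the chunk.  */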
394 | |
395 | EXT_CHUNK(1) |
396 | EXT_CHUNK(2) |
397 | EXT_CHUNK(3) |
398 | EXT_CHUNK(4) |
399 | EXT_CHUNK(5) |
400 | EXT_CHUNK(6) |
401 | EXT_CHUNK(7) |
402 | EXT_CHUNK(8) |
403 | EXT_CHUNK(9) |
404 | EXT_CHUNK(10) |
405 | EXT_CHUNK(11) |
406 | EXT_CHUNK(12) |
407 | EXT_CHUNK(13) |
408 | EXT_CHUNK(14) |
409 | EXT_CHUNK(15) |
410 | |
411 | L(move_long): |
412 | .p2align 4 |
413 | 1: |
414 | cbz tmp1, 3f |
415 | |
416 | add srcend, src, count |
417 | add dstend, dstin, count |
418 | |
419 | and tmp1, srcend, 15 |
420 | ldr D_q, [srcend, -16] |
421 | sub srcend, srcend, tmp1 |
422 | sub count, count, tmp1 |
423 | ldp A_q, B_q, [srcend, -32] |
424 | str D_q, [dstend, -16] |
425 | ldp C_q, D_q, [srcend, -64]! |
426 | sub dstend, dstend, tmp1 |
427 | subs count, count, 128 |
428 | b.ls 2f |
429 | |
430 | .p2align 4 |
431 | 1: |
432 | subs count, count, 64 |
433 | stp A_q, B_q, [dstend, -32] |
434 | ldp A_q, B_q, [srcend, -32] |
435 | stp C_q, D_q, [dstend, -64]! |
436 | ldp C_q, D_q, [srcend, -64]! |
437 | b.hi 1b |
438 | |
439 | /* Write the last full set of 64 bytes. The remainder is at most 64 |
440 | bytes, so it is safe to always copy 64 bytes from the start even if |
441 | there is just 1 byte left. */ |
442 | 2: |
443 | ldp E_q, F_q, [src, 32] |
444 | ldp G_q, H_q, [src] |
445 | stp A_q, B_q, [dstend, -32] |
446 | stp C_q, D_q, [dstend, -64] |
447 | stp E_q, F_q, [dstin, 32] |
448 | stp G_q, H_q, [dstin] |
449 | 3: ret |
450 | |
451 | |
452 | END (MEMCPY) |
453 | .section .rodata |
454 | .p2align 4 |
455 | |
456 | L(ext_table): |
457 | /* The first entry is for the alignment of 0 and is never |
458 | actually used (could be any value). */ |
459 | .word 0 |
460 | .word L(ext_size_1) -. |
461 | .word L(ext_size_2) -. |
462 | .word L(ext_size_3) -. |
463 | .word L(ext_size_4) -. |
464 | .word L(ext_size_5) -. |
465 | .word L(ext_size_6) -. |
466 | .word L(ext_size_7) -. |
467 | .word L(ext_size_8) -. |
468 | .word L(ext_size_9) -. |
469 | .word L(ext_size_10) -. |
470 | .word L(ext_size_11) -. |
471 | .word L(ext_size_12) -. |
472 | .word L(ext_size_13) -. |
473 | .word L(ext_size_14) -. |
474 | .word L(ext_size_15) -. |
475 | |
476 | libc_hidden_builtin_def (MEMCPY) |
477 | #endif |
478 | |