/* A Thunderx Optimized memcpy implementation for AARCH64.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* The actual code in this memcpy and memmove should be identical to the
   generic version except for the code under '#ifdef THUNDERX'.  This is
   to make it easier to keep this version and the generic version in sync
   for changes that are not specific to thunderx.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14
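
/* E, F and G alias registers that are still live further up (src, count,
   srcend, dst); they are only loaded in the tail sequences, after the
   aliased values are no longer needed.  */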

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.
*/
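
/* For orientation, a rough C-level sketch of the dispatch described above
   (illustrative only and not part of the build; the helper names are made
   up and the plain byte loops merely stand in for the unrolled paths):

     #include <stddef.h>

     static void copy_fwd (char *d, const char *s, size_t n)
     { for (size_t i = 0; i < n; i++) d[i] = s[i]; }

     static void copy_bwd (char *d, const char *s, size_t n)
     { while (n--) d[n] = s[n]; }

     void *sketch_memmove (void *dstin, const void *src, size_t count)
     {
       char *d = dstin;
       const char *s = src;
       // DST inside [SRC, SRC + COUNT) means a forward-overlapping move
       // that must be copied backwards.  The real code only takes the
       // backward path for count > 96, because the small and medium
       // memcpy paths load all data before storing any of it and are
       // therefore already overlap-safe.
       if ((size_t) (d - s) < count)
         copy_bwd (d, s, count);
       else
         copy_fwd (d, s, count);
       return dstin;
     }
 */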

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#if IS_IN (libc)

# undef MEMCPY
# define MEMCPY __memcpy_thunderx
# undef MEMMOVE
# define MEMMOVE __memmove_thunderx

ENTRY_ALIGN (MEMMOVE, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

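	/* Only large (> 96 byte) moves whose destination starts inside the
	   source region need the backward copy at L(move_long).  The unsigned
	   test "dstin - src < count" is true exactly when DSTIN lies in
	   [SRC, SRC + COUNT); the CCMP forces the branch to fall through
	   whenever COUNT <= 96.  Roughly, in C:

	     if (count > 96 && (size_t) (dstin - src) < count)
	       goto move_long;  */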
	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	/* Common case falls through into memcpy.  */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
ENTRY (MEMCPY)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
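	/* The 17..32 byte case is covered by the A and D pairs alone (the two
	   16-byte stores overlap in the middle); 33..64 bytes add the B and C
	   pairs.  Loading everything before the first store keeps this path
	   safe for overlapping memmove.  */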
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret
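
	/* Equivalent C for the 1..3 byte case above (illustrative only; the
	   name copy_1_3 is made up):

	     static void copy_1_3 (unsigned char *dst, const unsigned char *src,
	                           size_t count)   // count must be 1, 2 or 3
	     {
	       size_t mid = count >> 1;
	       unsigned char a = src[0], b = src[mid], c = src[count - 1];
	       dst[0] = a;  dst[mid] = b;  dst[count - 1] = c;
	     }

	   All loads happen before any store, so overlap is harmless.  For
	   count == 1 all three indices are 0, for count == 2 the middle byte
	   duplicates the last one, and for count == 3 each byte is written
	   exactly once.  */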

	.p2align 4
	/* Copy 65..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are more than 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration; the very largest copies also issue
	   software prefetches ahead of the reads.  */
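
	/* The alignment set-up below, roughly in C (illustrative; "skew" is
	   just a name for the low bits of DSTIN, with dst/src as char
	   pointers):
	     size_t skew = (uintptr_t) dstin & 15;
	     dst   = dstin - skew;    // 16-byte aligned store base
	     src   = src - skew;      // keeps the src-to-dst distance
	     count = count + skew;    // now counted from the aligned base
	   The first 16 bytes are loaded from the original SRC before SRC is
	   rewound and are stored unaligned at DSTIN, so the loop can start at
	   offset 16 from the aligned base without touching memory below
	   DSTIN.  */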

	.p2align 4
L(copy_long):

	/* On thunderx, large memcpys are helped by software prefetching.
	   This loop is identical to the one below it except that it includes
	   prefetch instructions.  For copies of less than 32768 bytes the
	   prefetching does not help and actually slows the code down, so we
	   only use the prefetching loop for the largest memcpys.  */

	cmp	count, #32768
	b.lo	L(copy_long_without_prefetch)
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	prfm	pldl1strm, [src, 384]
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */

L(prefetch_loop64):
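	/* Bit 6 of SRC toggles on every 64-byte iteration, so the prefetch
	   below is issued on every other pass: one PRFM per 128 bytes copied,
	   targeting data 512 bytes ahead of the current read position.  */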
	tbz	src, #6, 1f
	prfm	pldl1strm, [src, 512]
1:
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(prefetch_loop64)
	b	L(last64)

L(copy_long_without_prefetch):

	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(last64)
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
L(last64):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4
L(move_long):
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	/* Align dstend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are more than 96 bytes
	   to copy, so copy 16 bytes unaligned from the end and then align.
	   The loop below copies 64 bytes per iteration, working backwards.  */

	and	tmp1, dstend, 15
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

	nop
1:
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
3:	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)

#endif