/* A Thunderx Optimized memcpy implementation for AARCH64.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* The actual code in this memcpy and memmove should be identical to the
   generic version except for the code under '#ifdef THUNDERX'.  This is
   to make it easier to keep this version and the generic version in sync
   for changes that are not specific to thunderx.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large copies
   of more than 96 bytes which align the destination and use an unrolled
   loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.
*/
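
/* As an illustrative, non-normative C-like sketch of the dispatch
   implemented below (the three helper names are purely hypothetical):

     if (count <= 16)
       copy_0_16 ();	// overlapping 8-, 4- or 1-byte accesses
     else if (count <= 96)
       copy_17_96 ();	// fully unrolled, all loads before any store
     else
       copy_large ();	// align dst, 64 bytes per loop iteration

   memmove shares all of this code; only large copies whose destination
   overlaps the source from above take the separate backward-copying
   path at L(move_long).  */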

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#if IS_IN (libc)

# undef MEMCPY
# define MEMCPY __memcpy_thunderx
# undef MEMMOVE
# define MEMMOVE __memmove_thunderx

ENTRY_ALIGN (MEMMOVE, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

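	/* tmp1 = dstin - src (mod 2^64).  When count > 96, branch to the
	   backward-copying path if tmp1 < count, i.e. if the destination
	   overlaps the source from above.  When count <= 96 the ccmp sets
	   the flags to "higher or same" (nzcv == 2 sets the carry flag),
	   so the branch is not taken and we fall through into memcpy.  */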
	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	/* Common case falls through into memcpy.  */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
ENTRY (MEMCPY)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
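	/* count is 17..96 and tmp1 = count - 1 below: bit 6 of tmp1 is set
	   iff count >= 65, in which case L(copy96) handles the copy.
	   Otherwise bit 5 of tmp1 distinguishes 17..32 bytes (the first and
	   last 16 bytes suffice) from 33..64 bytes (additionally copy the
	   second 16 bytes and the 16 bytes starting 32 before the end).
	   All loads are issued before any store, so this is also safe for
	   overlapping memmove.  */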
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
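	/* For example, with count==3, tmp1 is 1, so src[0], src[1] and
	   src[2] are written to the first, second and last byte of the
	   destination; with count==1 all three stores write byte 0.  */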
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
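	/* A_l/A_h already hold the first 16 bytes; they were loaded before
	   the branch to this label.  */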
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to a 16-byte boundary so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration; for very large copies the variant
	   below additionally issues software prefetches.  */

	.p2align 4
L(copy_long):

	/* On thunderx, large memcpys benefit from software prefetching.
	   This loop is identical to the one below it but with prefetch
	   instructions included.  For copies of less than 32768 bytes the
	   prefetching does not help and slows the code down, so the
	   prefetching loop is only used for the largest copies.  */

	cmp	count, #32768
	b.lo	L(copy_long_without_prefetch)
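	/* Copy the first 16 bytes unaligned, then round dst down to a
	   16-byte boundary and bias src down by the same amount so the loop
	   can use matching pre-indexed offsets; count is adjusted to
	   compensate.  The prologue prefetch reaches 384 bytes beyond the
	   adjusted source pointer.  */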
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	prfm	pldl1strm, [src, 384]
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */

L(prefetch_loop64):
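	/* Issue one prefetch per 128 bytes copied: src advances by 64 each
	   iteration, so bit 6 toggles every iteration and the prefetch runs
	   on alternate iterations, reaching 512 bytes ahead.  */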
	tbz	src, #6, 1f
	prfm	pldl1strm, [src, 512]
1:
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(prefetch_loop64)
	b	L(last64)

L(copy_long_without_prefetch):

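	/* Same 16-byte alignment prologue and 64-byte loop as above, but
	   without the software prefetches.  */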
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(last64)
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
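	/* A_l..D_h still hold 64 bytes that have been loaded but not yet
	   stored; store them at dst+16..dst+79, then copy the final 64
	   bytes from the end of the buffer.  */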
L(last64):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4
L(move_long):
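	/* Large memmove whose destination overlaps the source from above:
	   copy backwards so the overlapping tail is read before it is
	   overwritten.  If dst == src there is nothing to do.  */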
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	/* Align dstend to a 16-byte boundary so that we don't cross cache
	   line boundaries on both loads and stores.  There are at least 96
	   bytes to copy, so copy 16 bytes unaligned and then align.  The
	   loop copies 64 bytes per iteration backwards.  */
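	/* Copy the last 16 bytes unaligned, then round dstend down to a
	   16-byte boundary, biasing srcend down by the same amount and
	   reducing count accordingly.  */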

	and	tmp1, dstend, 15
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

	nop
1:
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
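	/* A_l..D_h hold 64 bytes that have been loaded but not yet stored;
	   store them, then copy the first 64 bytes of the buffer to cover
	   whatever remains at the front.  */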
2:
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
3:	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)

#endif