/* Optimized memcpy for Fujitsu A64FX processor.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#undef BTI_C
#define BTI_C

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */

#define dstin	x0
#define src	x1
#define n	x2
#define dst	x3
#define dstend	x4
#define srcend	x5
#define tmp	x6
#define vlen	x7
#define vlen8	x8

#if HAVE_AARCH64_SVE_ASM

	.arch armv8.2-a+sve

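	/* Load 8 consecutive vector-length blocks from src into z0-z7;
	   p0 selects the active byte lanes.  */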
	.macro ld1b_unroll8
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

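	/* Store the 8 vectors loaded for the previous iteration while
	   reloading the same registers from the next source block.
	   Interleaving the stores and loads in two 4-vector halves keeps
	   the main copy loop software pipelined.  */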
	.macro stld1b_unroll4a
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	.endm

	.macro stld1b_unroll4b
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

	.macro stld1b_unroll8
	stld1b_unroll4a
	stld1b_unroll4b
	.endm

	.macro st1b_unroll8
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	.endm

#undef BTI_C
#define BTI_C

ENTRY (__memcpy_a64fx)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

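	/* Fast path: sizes of at most 2 vectors are copied with two
	   predicated load/store pairs.  Rough SVE ACLE sketch of this path
	   (illustrative only, not part of glibc; assumes uint8_t pointers
	   and vlen == svcntb ()):
	     svbool_t p0 = svwhilelt_b8_u64 (0, n);
	     svbool_t p1 = svwhilelt_b8_u64 (vlen, n);
	     svst1_u8 (p0, dstin, svld1_u8 (p0, src));
	     svst1_u8 (p1, dstin + vlen, svld1_u8 (p1, src + vlen));  */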
	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	L(copy_small)
	whilelo	p1.b, vlen, n
	whilelo	p0.b, xzr, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	.p2align 4

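	/* Copies of 2-8 vectors: copy 2 or 4 vectors from the start of the
	   buffer and the same number from the end.  The two ranges may
	   overlap in the middle; since every load is issued before any
	   store, this is also safe for the overlapping memmove calls that
	   branch here.  */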
L(copy_small):
	cmp	n, vlen, lsl 3
	b.hi	L(copy_large)
	add	dstend, dstin, n
	add	srcend, src, n
	cmp	n, vlen, lsl 2
	b.hi	1f

	/* Copy 2-4 vectors.  */
	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* Copy 4-8 vectors.  */
1:	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [srcend, -4, mul vl]
	ld1b	z5.b, p0/z, [srcend, -3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -2, mul vl]
	ld1b	z7.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstin, 2, mul vl]
	st1b	z3.b, p0, [dstin, 3, mul vl]
	st1b	z4.b, p0, [dstend, -4, mul vl]
	st1b	z5.b, p0, [dstend, -3, mul vl]
	st1b	z6.b, p0, [dstend, -2, mul vl]
	st1b	z7.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* At least 8 vectors - always align to vector length for
	   higher and consistent write performance.  */
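	/* The first vlen - (dstin % vlen) bytes are copied with a predicated
	   store so that dst becomes vector aligned; src, dst and n are then
	   advanced past that head before the main loop.  */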
L(copy_large):
	sub	tmp, vlen, 1
	and	tmp, dstin, tmp
	sub	tmp, vlen, tmp
	whilelo	p1.b, xzr, tmp
	ld1b	z1.b, p1/z, [src]
	st1b	z1.b, p1, [dstin]
	add	dst, dstin, tmp
	add	src, src, tmp
	sub	n, n, tmp
	ptrue	p0.b

	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
	ld1b_unroll8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined loop.  */
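	/* z0-z7 already hold the next 8 source vectors (loaded above, or by
	   the previous iteration), so each pass stores them and reloads the
	   registers from the following block.  Label 2 below stores the last
	   in-flight 8 vectors once the loop exits.  */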
1:	stld1b_unroll8
	add	dst, dst, vlen8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.hi	1b
2:	st1b_unroll8
	add	dst, dst, vlen8
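	/* n was biased by -8*vlen for the loop exit tests; restore the
	   remaining byte count (at most 8 vectors) before the tail code.  */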
3:	add	n, n, vlen8

	/* Move last 0-8 vectors.  */
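	/* This tail code is shared with __memmove_a64fx below.  Each path
	   issues all of its loads before any store, so it also works when
	   the remaining source and destination ranges overlap.  */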
L(last_bytes):
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p1, [dst, 1, mul vl]
	ret

	.p2align 4

1:	add	srcend, src, n
	add	dstend, dst, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	cmp	n, vlen, lsl 2
	b.hi	1f

	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

1:	ld1b	z4.b, p0/z, [src, 2, mul vl]
	ld1b	z5.b, p0/z, [src, 3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -4, mul vl]
	ld1b	z7.b, p0/z, [srcend, -3, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z4.b, p0, [dst, 2, mul vl]
	st1b	z5.b, p0, [dst, 3, mul vl]
	st1b	z6.b, p0, [dstend, -4, mul vl]
	st1b	z7.b, p0, [dstend, -3, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

END (__memcpy_a64fx)


ENTRY_ALIGN (__memmove_a64fx, 4)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	/* Fast case for up to 2 vectors.  */
	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
L(full_overlap):
	ret

	.p2align 4
	/* Check for overlapping moves.  Return if there is a full overlap.
	   Small moves up to 8 vectors use the overlap-safe copy_small code.
	   Non-overlapping or overlapping moves with dst < src use memcpy.
	   Overlapping moves with dst > src use a backward copy loop.  */
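	/* The distance dst - src is compared unsigned against n: if it is
	   at least n the destination does not overlap the source from
	   behind (this also covers dst < src, where the masked difference
	   is large), so the forward copy in L(copy_large) can be used.  */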
1:	sub	tmp, dstin, src
	ands	tmp, tmp, 0xffffffffffffff	/* Clear special tag bits.  */
	b.eq	L(full_overlap)
	cmp	n, vlen, lsl 3
	b.ls	L(copy_small)
	cmp	tmp, n
	b.hs	L(copy_large)

	/* Align to vector length.  */
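	/* Backward copy: first copy the last 1 to vlen bytes with a
	   predicated store so that the end of the remaining region,
	   dstin + n, is aligned to the vector length, then copy backwards
	   8 vectors at a time.  */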
	add	dst, dstin, n
	sub	tmp, vlen, 1
	ands	tmp, dst, tmp
	csel	tmp, tmp, vlen, ne
	whilelo	p1.b, xzr, tmp
	sub	n, n, tmp
	ld1b	z1.b, p1/z, [src, n]
	st1b	z1.b, p1, [dstin, n]
	add	src, src, n
	add	dst, dstin, n

	ptrue	p0.b
	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
	sub	src, src, vlen8
	ld1b_unroll8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined backward copy loop.  */
1:	sub	src, src, vlen8
	sub	dst, dst, vlen8
	stld1b_unroll8
	subs	n, n, vlen8
	b.hi	1b
2:	sub	dst, dst, vlen8
	st1b_unroll8
3:	add	n, n, vlen8

	/* Adjust src/dst for last 0-8 vectors.  */
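	/* src and dst are moved back to the start of the buffers so that
	   the remaining first n bytes are handled by the shared
	   L(last_bytes) code, which loads everything before storing and is
	   therefore overlap safe.  */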
	sub	src, src, n
	mov	dst, dstin
	b	L(last_bytes)

END (__memmove_a64fx)
#endif /* HAVE_AARCH64_SVE_ASM */