/* Optimized memcpy for Fujitsu A64FX processor.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#undef BTI_C
#define BTI_C

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */

#define dstin	x0
#define src	x1
#define n	x2
#define dst	x3
#define dstend	x4
#define srcend	x5
#define tmp	x6
#define vlen	x7
#define vlen8	x8

#if HAVE_AARCH64_SVE_ASM

	.arch armv8.2-a+sve

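	/* Load 8 consecutive vectors from [src] into z0-z7 (predicated by p0).  */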
	.macro ld1b_unroll8
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

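	/* First half of the store/load pipeline step: store z0-z3 to [dst]
	   and reload them from [src], interleaved in pairs.  */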
	.macro stld1b_unroll4a
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	.endm

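	/* Second half: store z4-z7 to [dst] and reload them from [src].  */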
	.macro stld1b_unroll4b
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

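	/* Store all 8 vectors loaded in the previous iteration and load the
	   next 8.  */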
	.macro stld1b_unroll8
	stld1b_unroll4a
	stld1b_unroll4b
	.endm

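	/* Store 8 vectors z0-z7 to [dst].  */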
	.macro st1b_unroll8
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	.endm

ENTRY (__memcpy_a64fx)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

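	/* Copies of at most 2 vectors (including n == 0) are done inline with
	   predicated loads and stores; larger copies branch to L(copy_small).  */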
	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	L(copy_small)
	whilelo	p1.b, vlen, n
	whilelo	p0.b, xzr, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	.p2align 4

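	/* 2-8 vectors; copies of more than 8 vectors continue at L(copy_large).  */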
L(copy_small):
	cmp	n, vlen, lsl 3
	b.hi	L(copy_large)
	add	dstend, dstin, n
	add	srcend, src, n
	cmp	n, vlen, lsl 2
	b.hi	1f

	/* Copy 2-4 vectors.  */
	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* Copy 4-8 vectors.  */
1:	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [srcend, -4, mul vl]
	ld1b	z5.b, p0/z, [srcend, -3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -2, mul vl]
	ld1b	z7.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstin, 2, mul vl]
	st1b	z3.b, p0, [dstin, 3, mul vl]
	st1b	z4.b, p0, [dstend, -4, mul vl]
	st1b	z5.b, p0, [dstend, -3, mul vl]
	st1b	z6.b, p0, [dstend, -2, mul vl]
	st1b	z7.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* More than 8 vectors - always align to vector length for
	   higher and consistent write performance.  */
L(copy_large):
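	/* Copy vlen - (dstin % vlen) bytes so that dst becomes vector aligned
	   (a full vector if dstin is already aligned), then advance src/dst
	   and reduce n accordingly.  */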
	sub	tmp, vlen, 1
	and	tmp, dstin, tmp
	sub	tmp, vlen, tmp
	whilelo	p1.b, xzr, tmp
	ld1b	z1.b, p1/z, [src]
	st1b	z1.b, p1, [dstin]
	add	dst, dstin, tmp
	add	src, src, tmp
	sub	n, n, tmp
	ptrue	p0.b

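	/* vlen8 = 8 * vlen.  If more than 8 vectors remain, preload the first
	   8 to prime the software pipelined loop.  */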
	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
	ld1b_unroll8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined loop.  */
1:	stld1b_unroll8
	add	dst, dst, vlen8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.hi	1b
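	/* Store the final 8 preloaded vectors, then restore n to the number
	   of tail bytes left for L(last_bytes).  */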
2:	st1b_unroll8
	add	dst, dst, vlen8
3:	add	n, n, vlen8

	/* Move last 0-8 vectors.  */
L(last_bytes):
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p1, [dst, 1, mul vl]
	ret

	.p2align 4

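	/* More than 2 vectors remain: copy 2 vectors from each end, or 4 from
	   each end if more than 4 remain.  The head and tail stores may overlap.  */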
1:	add	srcend, src, n
	add	dstend, dst, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	cmp	n, vlen, lsl 2
	b.hi	1f

	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

1:	ld1b	z4.b, p0/z, [src, 2, mul vl]
	ld1b	z5.b, p0/z, [src, 3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -4, mul vl]
	ld1b	z7.b, p0/z, [srcend, -3, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z4.b, p0, [dst, 2, mul vl]
	st1b	z5.b, p0, [dst, 3, mul vl]
	st1b	z6.b, p0, [dstend, -4, mul vl]
	st1b	z7.b, p0, [dstend, -3, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

END (__memcpy_a64fx)


ENTRY_ALIGN (__memmove_a64fx, 4)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	/* Fast case for up to 2 vectors.  */
	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
L(full_overlap):
	ret

	.p2align 4
	/* Check for overlapping moves.  Return if there is a full overlap.
	   Small moves up to 8 vectors use the overlap-safe copy_small code.
	   Non-overlapping or overlapping moves with dst < src use memcpy.
	   Overlapping moves with dst > src use a backward copy loop.  */
1:	sub	tmp, dstin, src
	ands	tmp, tmp, 0xffffffffffffff	/* Clear special tag bits.  */
	b.eq	L(full_overlap)
	cmp	n, vlen, lsl 3
	b.ls	L(copy_small)
	cmp	tmp, n
	b.hs	L(copy_large)

	/* Align the end of the buffer for the backward copy: copy the trailing
	   (dstin + n) % vlen bytes first (a full vector if the end is already
	   aligned), leaving a region whose end is vector aligned.  */
	add	dst, dstin, n
	sub	tmp, vlen, 1
	ands	tmp, dst, tmp
	csel	tmp, tmp, vlen, ne
	whilelo	p1.b, xzr, tmp
	sub	n, n, tmp
	ld1b	z1.b, p1/z, [src, n]
	st1b	z1.b, p1, [dstin, n]
	add	src, src, n
	add	dst, dstin, n

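	/* Prime the backward pipelined loop: if more than 8 vectors remain,
	   preload the topmost 8 vectors.  */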
	ptrue	p0.b
	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
	sub	src, src, vlen8
	ld1b_unroll8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined backward copy loop.  */
1:	sub	src, src, vlen8
	sub	dst, dst, vlen8
	stld1b_unroll8
	subs	n, n, vlen8
	b.hi	1b
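	/* Store the final 8 preloaded vectors, then restore n for the tail copy.  */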
2:	sub	dst, dst, vlen8
	st1b_unroll8
3:	add	n, n, vlen8

	/* Adjust src/dst for last 0-8 vectors.  */
	sub	src, src, n
	mov	dst, dstin
	b	L(last_bytes)

END (__memmove_a64fx)
#endif /* HAVE_AARCH64_SVE_ASM */