/* Optimized memcpy for Fujitsu A64FX processor.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#undef BTI_C
#define BTI_C

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */

#define dstin	x0
#define src	x1
#define n	x2
#define dst	x3
#define dstend	x4
#define srcend	x5
#define tmp	x6
#define vlen	x7
#define vlen8	x8

#if HAVE_AARCH64_SVE_ASM

	.arch armv8.2-a+sve

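	/* Load 8 consecutive vectors from [src] into z0-z7 (predicated by p0).  */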
	.macro ld1b_unroll8
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

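	/* First half of the store/load pipeline step: store z0-z3 to [dst]
	   and reload them from [src], interleaved in pairs.  */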
	.macro stld1b_unroll4a
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	.endm

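	/* Second half: store z4-z7 to [dst] and reload them from [src].  */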
	.macro stld1b_unroll4b
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

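	/* Store all 8 vectors loaded in the previous iteration and load the
	   next 8.  */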
	.macro stld1b_unroll8
	stld1b_unroll4a
	stld1b_unroll4b
	.endm

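	/* Store 8 vectors z0-z7 to [dst].  */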
	.macro st1b_unroll8
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	.endm

ENTRY (__memcpy_a64fx)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

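	/* Copies of at most 2 vectors (including n == 0) are done inline with
	   predicated loads and stores; larger copies branch to L(copy_small).  */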
	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	L(copy_small)
	whilelo	p1.b, vlen, n
	whilelo	p0.b, xzr, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	.p2align 4

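	/* 2-8 vectors; copies of more than 8 vectors continue at L(copy_large).  */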
L(copy_small):
	cmp	n, vlen, lsl 3
	b.hi	L(copy_large)
	add	dstend, dstin, n
	add	srcend, src, n
	cmp	n, vlen, lsl 2
	b.hi	1f

	/* Copy 2-4 vectors.  */
	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* Copy 4-8 vectors.  */
1:	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [srcend, -4, mul vl]
	ld1b	z5.b, p0/z, [srcend, -3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -2, mul vl]
	ld1b	z7.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstin, 2, mul vl]
	st1b	z3.b, p0, [dstin, 3, mul vl]
	st1b	z4.b, p0, [dstend, -4, mul vl]
	st1b	z5.b, p0, [dstend, -3, mul vl]
	st1b	z6.b, p0, [dstend, -2, mul vl]
	st1b	z7.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* More than 8 vectors - always align to vector length for
	   higher and consistent write performance.  */
L(copy_large):
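	/* Copy vlen - (dstin % vlen) bytes so that dst becomes vector aligned
	   (a full vector if dstin is already aligned), then advance src/dst
	   and reduce n accordingly.  */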
	sub	tmp, vlen, 1
	and	tmp, dstin, tmp
	sub	tmp, vlen, tmp
	whilelo	p1.b, xzr, tmp
	ld1b	z1.b, p1/z, [src]
	st1b	z1.b, p1, [dstin]
	add	dst, dstin, tmp
	add	src, src, tmp
	sub	n, n, tmp
	ptrue	p0.b

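	/* vlen8 = 8 * vlen.  If more than 8 vectors remain, preload the first
	   8 to prime the software pipelined loop.  */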
	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
	ld1b_unroll8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined loop.  */
1:	stld1b_unroll8
	add	dst, dst, vlen8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.hi	1b
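	/* Store the final 8 preloaded vectors, then restore n to the number
	   of tail bytes left for L(last_bytes).  */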
2:	st1b_unroll8
	add	dst, dst, vlen8
3:	add	n, n, vlen8

	/* Move last 0-8 vectors.  */
L(last_bytes):
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p1, [dst, 1, mul vl]
	ret

	.p2align 4

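	/* More than 2 vectors remain: copy 2 vectors from each end, or 4 from
	   each end if more than 4 remain.  The head and tail stores may overlap.  */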
1:	add	srcend, src, n
	add	dstend, dst, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	cmp	n, vlen, lsl 2
	b.hi	1f

	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

1:	ld1b	z4.b, p0/z, [src, 2, mul vl]
	ld1b	z5.b, p0/z, [src, 3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -4, mul vl]
	ld1b	z7.b, p0/z, [srcend, -3, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z4.b, p0, [dst, 2, mul vl]
	st1b	z5.b, p0, [dst, 3, mul vl]
	st1b	z6.b, p0, [dstend, -4, mul vl]
	st1b	z7.b, p0, [dstend, -3, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

END (__memcpy_a64fx)


ENTRY_ALIGN (__memmove_a64fx, 4)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	/* Fast case for up to 2 vectors.  */
	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
L(full_overlap):
	ret

	.p2align 4
	/* Check for overlapping moves.  Return if there is a full overlap.
	   Small moves up to 8 vectors use the overlap-safe copy_small code.
	   Non-overlapping or overlapping moves with dst < src use memcpy.
	   Overlapping moves with dst > src use a backward copy loop.  */
1:	sub	tmp, dstin, src
	ands	tmp, tmp, 0xffffffffffffff	/* Clear special tag bits.  */
	b.eq	L(full_overlap)
	cmp	n, vlen, lsl 3
	b.ls	L(copy_small)
	cmp	tmp, n
	b.hs	L(copy_large)

	/* Align the end of the buffer for the backward copy: copy the trailing
	   (dstin + n) % vlen bytes first (a full vector if the end is already
	   aligned), leaving a region whose end is vector aligned.  */
	add	dst, dstin, n
	sub	tmp, vlen, 1
	ands	tmp, dst, tmp
	csel	tmp, tmp, vlen, ne
	whilelo	p1.b, xzr, tmp
	sub	n, n, tmp
	ld1b	z1.b, p1/z, [src, n]
	st1b	z1.b, p1, [dstin, n]
	add	src, src, n
	add	dst, dstin, n

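	/* Prime the backward pipelined loop: if more than 8 vectors remain,
	   preload the topmost 8 vectors.  */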
	ptrue	p0.b
	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
	sub	src, src, vlen8
	ld1b_unroll8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined backward copy loop.  */
1:	sub	src, src, vlen8
	sub	dst, dst, vlen8
	stld1b_unroll8
	subs	n, n, vlen8
	b.hi	1b
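	/* Store the final 8 preloaded vectors, then restore n for the tail copy.  */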
2:	sub	dst, dst, vlen8
	st1b_unroll8
3:	add	n, n, vlen8

	/* Adjust src/dst for last 0-8 vectors.  */
	sub	src, src, n
	mov	dst, dstin
	b	L(last_bytes)

END (__memmove_a64fx)
#endif /* HAVE_AARCH64_SVE_ASM */