/* Optimized memcpy for Qualcomm Falkor processor.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:

   ARMv8-a, AArch64, falkor, unaligned accesses.  */

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define tmp1 x14
#define A_x x6
#define B_x x7
#define A_w w6
#define B_w w7

#define A_q q0
#define B_q q1
#define C_q q2
#define D_q q3
#define E_q q4
#define F_q q5
#define G_q q6
#define H_q q7
#define Q_q q6
#define S_q q22

/* Copies are split into 3 main cases:

   1. Small copies of up to 32 bytes
   2. Medium copies of 33..128 bytes which are fully unrolled
   3. Large copies of more than 128 bytes.

   Large copies align the source to a quad word and use an unrolled loop
   processing 64 bytes per iteration.

   FALKOR-SPECIFIC DESIGN:

   The smallest copies (32 bytes or less) focus on optimal pipeline usage,
   which is why the redundant copies of 0-3 bytes have been replaced with
   conditionals; those would otherwise break needlessly across multiple
   issue groups.  The medium copy group has been enlarged to 128 bytes since
   raising the small copy limit to 32 bytes lets us do that at no cost and
   also shrinks the prep code before loop64.

   The copy loop uses only one vector register, q0.  This ensures that all
   loads hit a single hardware prefetcher, which can then be trained to
   prefetch a single stream.  */
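
/* For reference only (this block is a comment, not assembled): a hedged C
   sketch of the small and medium copy strategy described above.  The names
   copy16 and copy_upto_128 are hypothetical; copy16 stands for one
   q-register load/store pair.

     #include <string.h>
     #include <stddef.h>

     // Hypothetical helper: one 16-byte (q-register) load/store pair.
     static void copy16 (char *d, const char *s) { memcpy (d, s, 16); }

     // Rough model of the 0..128 byte paths: the head and tail copies
     // overlap in the middle, so no per-byte loop is needed.
     static void copy_upto_128 (char *d, const char *s, size_t n)
     {
       if (n <= 32)
         {
           if (n >= 16)
             {
               copy16 (d, s);
               copy16 (d + n - 16, s + n - 16);
             }
           else
             memcpy (d, s, n);          // asm uses an 8/4/2/1 bit-test ladder
           return;
         }
       copy16 (d, s);                   // n is 33..128 from here on
       copy16 (d + 16, s + 16);
       if (n > 64)
         {
           copy16 (d + 32, s + 32);
           copy16 (d + 48, s + 48);
           copy16 (d + n - 64, s + n - 64);
           copy16 (d + n - 48, s + n - 48);
         }
       copy16 (d + n - 32, s + n - 32);
       copy16 (d + n - 16, s + n - 16);
     }  */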

#if IS_IN (libc)
ENTRY_ALIGN (__memcpy_falkor, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp count, 32
	add srcend, src, count
	add dstend, dstin, count
	b.ls L(copy32)
	cmp count, 128
	b.hi L(copy_long)

	/* Medium copies: 33..128 bytes.  */
L(copy128):
	sub tmp1, count, 1
	ldr A_q, [src]
	ldr B_q, [src, 16]
	ldr C_q, [srcend, -32]
	ldr D_q, [srcend, -16]
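	/* Bit 6 of count-1 is set iff count >= 65, i.e. iff the 64 bytes
	   already loaded above do not cover the whole copy; in that case
	   copy another 32 bytes from each end.  */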
	tbz tmp1, 6, 1f
	ldr E_q, [src, 32]
	ldr F_q, [src, 48]
	ldr G_q, [srcend, -64]
	ldr H_q, [srcend, -48]
	str G_q, [dstend, -64]
	str H_q, [dstend, -48]
	str E_q, [dstin, 32]
	str F_q, [dstin, 48]
1:
	str A_q, [dstin]
	str B_q, [dstin, 16]
	str C_q, [dstend, -32]
	str D_q, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..32 bytes.  */
L(copy32):
	/* 16-32 */
	cmp count, 16
	b.lo 1f
	ldr A_q, [src]
	ldr B_q, [srcend, -16]
	str A_q, [dstin]
	str B_q, [dstend, -16]
	ret
	.p2align 4
1:
	/* 8-15 */
	tbz count, 3, 1f
	ldr A_x, [src]
	ldr B_x, [srcend, -8]
	str A_x, [dstin]
	str B_x, [dstend, -8]
	ret
	.p2align 4
1:
	/* 4-7 */
	tbz count, 2, 1f
	ldr A_w, [src]
	ldr B_w, [srcend, -4]
	str A_w, [dstin]
	str B_w, [dstend, -4]
	ret
	.p2align 4
1:
	/* 2-3 */
	tbz count, 1, 1f
	ldrh A_w, [src]
	ldrh B_w, [srcend, -2]
	strh A_w, [dstin]
	strh B_w, [dstend, -2]
	ret
	.p2align 4
1:
	/* 0-1 */
	tbz count, 0, 1f
	ldrb A_w, [src]
	strb A_w, [dstin]
1:
	ret

	/* Align SRC to 16 bytes and copy; that way at least one of the
	   accesses is aligned throughout the copy sequence.

	   The count is then off by 0 to 15 bytes, but this is fine because
	   the last 64 bytes are trimmed from the loop and copied from the
	   end instead, so the loop never runs out of bounds.  */
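	/* Concretely, the loop count set up below is
	   count + (src % 16) - 80: 16 bytes for the unaligned head already
	   stored and 64 for the tail that is always copied from the end.
	   The loop therefore runs ceil ((count + (src % 16) - 80) / 64)
	   times and its last load stops short of srcend.  */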

	.p2align 4
	nop	/* Align loop64 below.  */
L(copy_long):
	ldr A_q, [src]
	sub count, count, 64 + 16
	and tmp1, src, 15
	str A_q, [dstin]
	bic src, src, 15
	sub dst, dstin, tmp1
	add count, count, tmp1

L(loop64):
	ldr A_q, [src, 16]!
	str A_q, [dst, 16]
	ldr A_q, [src, 16]!
	subs count, count, 64
	str A_q, [dst, 32]
	ldr A_q, [src, 16]!
	str A_q, [dst, 48]
	ldr A_q, [src, 16]!
	str A_q, [dst, 64]!
	b.hi L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
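	/* For example, with count == 129 and a 16-byte aligned src, the
	   loop above copies bytes 16..79, this tail copies bytes 65..128,
	   and the initial store covered bytes 0..15.  */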
	ldr E_q, [srcend, -64]
	str E_q, [dstend, -64]
	ldr D_q, [srcend, -48]
	str D_q, [dstend, -48]
	ldr C_q, [srcend, -32]
	str C_q, [dstend, -32]
	ldr B_q, [srcend, -16]
	str B_q, [dstend, -16]
	ret

END (__memcpy_falkor)
libc_hidden_builtin_def (__memcpy_falkor)


/* RATIONALE:

   The move has 4 distinct parts:
   * Small moves of 32 bytes and under.
   * Medium sized moves of 33-128 bytes (fully unrolled).
   * Large moves where the destination does not fall inside the source
     range (copied forwards).
   * Large moves where the destination lies within the source range
     (copied backwards, starting from the end).

   We use only two registers, q6 and q22, for the moves, and move 32 bytes
   per iteration to correctly train the hardware prefetcher for better
   throughput.

   The small and medium cases reuse the memcpy code above; it is
   overlap-safe because all of its loads are issued before any store that
   could clobber them.  */
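
/* For reference only (this block is a comment, not assembled): a hedged C
   model of the dispatch above.  memmove_sketch is a hypothetical name, and
   the byte loops stand in for the 32-bytes-per-iteration q6/q22 loops.

     #include <string.h>
     #include <stddef.h>
     #include <stdint.h>

     static void memmove_sketch (char *d, const char *s, size_t n)
     {
       if (n <= 128)
         {
           unsigned char tmp[128];
           memcpy (tmp, s, n);     // in the asm all loads are issued before
           memcpy (d, tmp, n);     // the possibly overlapping stores
         }
       else if ((uintptr_t) d - (uintptr_t) s >= n)
         for (size_t i = 0; i < n; i++)     // dst not inside [src, src+n):
           d[i] = s[i];                     // forward copy is safe
       else if (d != s)
         for (size_t i = n; i-- > 0; )      // L(move_long): copy backwards
           d[i] = s[i];
     }  */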

ENTRY_ALIGN (__memmove_falkor, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp count, 32
	add srcend, src, count
	add dstend, dstin, count
	b.ls L(copy32)
	cmp count, 128
	b.ls L(copy128)
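	/* Fall back to a backwards copy only if the destination lies inside
	   the source, i.e. if dstin - src, taken as an unsigned value, is
	   below count.  (The flags are still set from the count comparison
	   above, so the ccmp always performs its compare here.)  */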
	sub tmp1, dstin, src
	ccmp tmp1, count, 2, hi
	b.lo L(move_long)

	/* CASE: Copy Forwards

	   Align src to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */
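	/* Q_q and S_q form a two-deep software pipeline: each iteration of
	   the loop below stores the pair loaded on the previous iteration
	   and loads the next pair, keeping the loads one iteration ahead of
	   the stores.  */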

	ldr S_q, [src]
	and tmp1, src, 15
	bic src, src, 15
	sub dst, dstin, tmp1
	add count, count, tmp1	/* Count is now 16 too large.  */
	ldr Q_q, [src, 16]!
	str S_q, [dstin]
	ldr S_q, [src, 16]!
	sub count, count, 32 + 32 + 16	/* Test and readjust count.  */

	.p2align 4
1:
	subs count, count, 32
	str Q_q, [dst, 16]
	ldr Q_q, [src, 16]!
	str S_q, [dst, 32]!
	ldr S_q, [src, 16]!
	b.hi 1b

	/* Copy 32 bytes from the end before writing the data prefetched in
	   the last loop iteration.  */
2:
	ldr B_q, [srcend, -32]
	ldr C_q, [srcend, -16]
	str Q_q, [dst, 16]
	str S_q, [dst, 32]
	str B_q, [dstend, -32]
	str C_q, [dstend, -16]
	ret

	/* CASE: Copy Backwards

	   Align srcend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */
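	/* The count bookkeeping mirrors the forward case: count is reduced
	   by the alignment slack at srcend, plus 64 for the 32 bytes already
	   in flight in Q_q/S_q and the 32 bytes copied from the start after
	   the loop.  */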

	.p2align 4
	nop
	nop
L(move_long):
	cbz tmp1, 3f	/* Return early if src == dstin.  */
	ldr S_q, [srcend, -16]
	and tmp1, srcend, 15
	sub srcend, srcend, tmp1
	ldr Q_q, [srcend, -16]!
	str S_q, [dstend, -16]
	sub count, count, tmp1
	ldr S_q, [srcend, -16]!
	sub dstend, dstend, tmp1
	sub count, count, 32 + 32

1:
	subs count, count, 32
	str Q_q, [dstend, -16]
	ldr Q_q, [srcend, -16]!
	str S_q, [dstend, -32]!
	ldr S_q, [srcend, -16]!
	b.hi 1b

	/* Copy 32 bytes from the start before writing the data prefetched in
	   the last loop iteration.  */

	ldr B_q, [src, 16]
	ldr C_q, [src]
	str Q_q, [dstend, -16]
	str S_q, [dstend, -32]
	str B_q, [dstin, 16]
	str C_q, [dstin]
3:	ret

END (__memmove_falkor)
libc_hidden_builtin_def (__memmove_falkor)
#endif
