/* Optimized memcpy for Qualcomm Falkor processor.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:

   ARMv8-a, AArch64, falkor, unaligned accesses.  */

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define tmp1 x14
#define A_x x6
#define B_x x7
#define A_w w6
#define B_w w7

#define A_q q0
#define B_q q1
#define C_q q2
#define D_q q3
#define E_q q4
#define F_q q5
#define G_q q6
#define H_q q7
#define Q_q q6
#define S_q q22

/* Copies are split into 3 main cases:

   1. Small copies of up to 32 bytes
   2. Medium copies of 33..128 bytes which are fully unrolled
   3. Large copies of more than 128 bytes.

   Large copies align the source to a quad word and use an unrolled loop
   processing 64 bytes per iteration.

   FALKOR-SPECIFIC DESIGN:

   The smallest copies (32 bytes or less) focus on optimal pipeline usage,
   which is why the redundant copies of 0-3 bytes have been replaced with
   conditionals; those would otherwise break needlessly across multiple
   issue groups.  The medium copy group has been enlarged to 128 bytes since
   raising the small copy limit to 32 bytes lets us do that at no cost and
   also shrinks the prep code before loop64.

   The copy loop uses only one vector register, q0.  This ensures that all
   loads hit a single hardware prefetcher, which can then be trained to
   prefetch a single stream.  */
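
/* For reference only (this block is a comment, not assembled): a hedged C
   sketch of the small and medium copy strategy described above.  The names
   copy16 and copy_upto_128 are hypothetical; copy16 stands for one
   q-register load/store pair.

     #include <string.h>
     #include <stddef.h>

     // Hypothetical helper: one 16-byte (q-register) load/store pair.
     static void copy16 (char *d, const char *s) { memcpy (d, s, 16); }

     // Rough model of the 0..128 byte paths: the head and tail copies
     // overlap in the middle, so no per-byte loop is needed.
     static void copy_upto_128 (char *d, const char *s, size_t n)
     {
       if (n <= 32)
         {
           if (n >= 16)
             {
               copy16 (d, s);
               copy16 (d + n - 16, s + n - 16);
             }
           else
             memcpy (d, s, n);          // asm uses an 8/4/2/1 bit-test ladder
           return;
         }
       copy16 (d, s);                   // n is 33..128 from here on
       copy16 (d + 16, s + 16);
       if (n > 64)
         {
           copy16 (d + 32, s + 32);
           copy16 (d + 48, s + 48);
           copy16 (d + n - 64, s + n - 64);
           copy16 (d + n - 48, s + n - 48);
         }
       copy16 (d + n - 32, s + n - 32);
       copy16 (d + n - 16, s + n - 16);
     }  */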

#if IS_IN (libc)
ENTRY_ALIGN (__memcpy_falkor, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp count, 32
	add srcend, src, count
	add dstend, dstin, count
	b.ls L(copy32)
	cmp count, 128
	b.hi L(copy_long)

	/* Medium copies: 33..128 bytes.  */
L(copy128):
	sub tmp1, count, 1
	ldr A_q, [src]
	ldr B_q, [src, 16]
	ldr C_q, [srcend, -32]
	ldr D_q, [srcend, -16]
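	/* Bit 6 of count-1 is set iff count >= 65, i.e. iff the 64 bytes
	   already loaded above do not cover the whole copy; in that case
	   copy another 32 bytes from each end.  */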
	tbz tmp1, 6, 1f
	ldr E_q, [src, 32]
	ldr F_q, [src, 48]
	ldr G_q, [srcend, -64]
	ldr H_q, [srcend, -48]
	str G_q, [dstend, -64]
	str H_q, [dstend, -48]
	str E_q, [dstin, 32]
	str F_q, [dstin, 48]
1:
	str A_q, [dstin]
	str B_q, [dstin, 16]
	str C_q, [dstend, -32]
	str D_q, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..32 bytes.  */
L(copy32):
	/* 16-32 */
	cmp count, 16
	b.lo 1f
	ldr A_q, [src]
	ldr B_q, [srcend, -16]
	str A_q, [dstin]
	str B_q, [dstend, -16]
	ret
	.p2align 4
1:
	/* 8-15 */
	tbz count, 3, 1f
	ldr A_x, [src]
	ldr B_x, [srcend, -8]
	str A_x, [dstin]
	str B_x, [dstend, -8]
	ret
	.p2align 4
1:
	/* 4-7 */
	tbz count, 2, 1f
	ldr A_w, [src]
	ldr B_w, [srcend, -4]
	str A_w, [dstin]
	str B_w, [dstend, -4]
	ret
	.p2align 4
1:
	/* 2-3 */
	tbz count, 1, 1f
	ldrh A_w, [src]
	ldrh B_w, [srcend, -2]
	strh A_w, [dstin]
	strh B_w, [dstend, -2]
	ret
	.p2align 4
1:
	/* 0-1 */
	tbz count, 0, 1f
	ldrb A_w, [src]
	strb A_w, [dstin]
1:
	ret

	/* Align SRC to 16 bytes and copy; that way at least one of the
	   accesses is aligned throughout the copy sequence.

	   The count is then off by 0 to 15 bytes, but this is fine because
	   the last 64 bytes are trimmed from the loop and copied from the
	   end instead, so the loop never runs out of bounds.  */
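	/* Concretely, the loop count set up below is
	   count + (src % 16) - 80: 16 bytes for the unaligned head already
	   stored and 64 for the tail that is always copied from the end.
	   The loop therefore runs ceil ((count + (src % 16) - 80) / 64)
	   times and its last load stops short of srcend.  */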

	.p2align 4
	nop	/* Align loop64 below.  */
L(copy_long):
	ldr A_q, [src]
	sub count, count, 64 + 16
	and tmp1, src, 15
	str A_q, [dstin]
	bic src, src, 15
	sub dst, dstin, tmp1
	add count, count, tmp1

L(loop64):
	ldr A_q, [src, 16]!
	str A_q, [dst, 16]
	ldr A_q, [src, 16]!
	subs count, count, 64
	str A_q, [dst, 32]
	ldr A_q, [src, 16]!
	str A_q, [dst, 48]
	ldr A_q, [src, 16]!
	str A_q, [dst, 64]!
	b.hi L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
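	/* For example, with count == 129 and a 16-byte aligned src, the
	   loop above copies bytes 16..79, this tail copies bytes 65..128,
	   and the initial store covered bytes 0..15.  */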
	ldr E_q, [srcend, -64]
	str E_q, [dstend, -64]
	ldr D_q, [srcend, -48]
	str D_q, [dstend, -48]
	ldr C_q, [srcend, -32]
	str C_q, [dstend, -32]
	ldr B_q, [srcend, -16]
	str B_q, [dstend, -16]
	ret

END (__memcpy_falkor)
libc_hidden_builtin_def (__memcpy_falkor)


/* RATIONALE:

   The move has 4 distinct parts:
   * Small moves of 32 bytes and under.
   * Medium sized moves of 33-128 bytes (fully unrolled).
   * Large moves where the destination does not fall inside the source
     range (copied forwards).
   * Large moves where the destination lies within the source range
     (copied backwards, starting from the end).

   We use only two registers, q6 and q22, for the moves, and move 32 bytes
   per iteration to correctly train the hardware prefetcher for better
   throughput.

   The small and medium cases reuse the memcpy code above; it is
   overlap-safe because all of its loads are issued before any store that
   could clobber them.  */
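
/* For reference only (this block is a comment, not assembled): a hedged C
   model of the dispatch above.  memmove_sketch is a hypothetical name, and
   the byte loops stand in for the 32-bytes-per-iteration q6/q22 loops.

     #include <string.h>
     #include <stddef.h>
     #include <stdint.h>

     static void memmove_sketch (char *d, const char *s, size_t n)
     {
       if (n <= 128)
         {
           unsigned char tmp[128];
           memcpy (tmp, s, n);     // in the asm all loads are issued before
           memcpy (d, tmp, n);     // the possibly overlapping stores
         }
       else if ((uintptr_t) d - (uintptr_t) s >= n)
         for (size_t i = 0; i < n; i++)     // dst not inside [src, src+n):
           d[i] = s[i];                     // forward copy is safe
       else if (d != s)
         for (size_t i = n; i-- > 0; )      // L(move_long): copy backwards
           d[i] = s[i];
     }  */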

ENTRY_ALIGN (__memmove_falkor, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp count, 32
	add srcend, src, count
	add dstend, dstin, count
	b.ls L(copy32)
	cmp count, 128
	b.ls L(copy128)
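	/* Fall back to a backwards copy only if the destination lies inside
	   the source, i.e. if dstin - src, taken as an unsigned value, is
	   below count.  (The flags are still set from the count comparison
	   above, so the ccmp always performs its compare here.)  */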
	sub tmp1, dstin, src
	ccmp tmp1, count, 2, hi
	b.lo L(move_long)

	/* CASE: Copy Forwards

	   Align src to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */
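	/* Q_q and S_q form a two-deep software pipeline: each iteration of
	   the loop below stores the pair loaded on the previous iteration
	   and loads the next pair, keeping the loads one iteration ahead of
	   the stores.  */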

	ldr S_q, [src]
	and tmp1, src, 15
	bic src, src, 15
	sub dst, dstin, tmp1
	add count, count, tmp1	/* Count is now 16 too large.  */
	ldr Q_q, [src, 16]!
	str S_q, [dstin]
	ldr S_q, [src, 16]!
	sub count, count, 32 + 32 + 16	/* Test and readjust count.  */

	.p2align 4
1:
	subs count, count, 32
	str Q_q, [dst, 16]
	ldr Q_q, [src, 16]!
	str S_q, [dst, 32]!
	ldr S_q, [src, 16]!
	b.hi 1b

	/* Copy 32 bytes from the end before writing the data prefetched in
	   the last loop iteration.  */
2:
	ldr B_q, [srcend, -32]
	ldr C_q, [srcend, -16]
	str Q_q, [dst, 16]
	str S_q, [dst, 32]
	str B_q, [dstend, -32]
	str C_q, [dstend, -16]
	ret

	/* CASE: Copy Backwards

	   Align srcend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */
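	/* The count bookkeeping mirrors the forward case: count is reduced
	   by the alignment slack at srcend, plus 64 for the 32 bytes already
	   in flight in Q_q/S_q and the 32 bytes copied from the start after
	   the loop.  */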

	.p2align 4
	nop
	nop
L(move_long):
	cbz tmp1, 3f	/* Return early if src == dstin.  */
	ldr S_q, [srcend, -16]
	and tmp1, srcend, 15
	sub srcend, srcend, tmp1
	ldr Q_q, [srcend, -16]!
	str S_q, [dstend, -16]
	sub count, count, tmp1
	ldr S_q, [srcend, -16]!
	sub dstend, dstend, tmp1
	sub count, count, 32 + 32

1:
	subs count, count, 32
	str Q_q, [dstend, -16]
	ldr Q_q, [srcend, -16]!
	str S_q, [dstend, -32]!
	ldr S_q, [srcend, -16]!
	b.hi 1b

	/* Copy 32 bytes from the start before writing the data prefetched in
	   the last loop iteration.  */

	ldr B_q, [src, 16]
	ldr C_q, [src]
	str Q_q, [dstend, -16]
	str S_q, [dstend, -32]
	str B_q, [dstin, 16]
	str C_q, [dstin]
3:	ret

END (__memmove_falkor)
libc_hidden_builtin_def (__memmove_falkor)
#endif
