1 | /* Optimized version of the standard memcpy() function. |
2 | This file is part of the GNU C Library. |
3 | Copyright (C) 2000-2022 Free Software Foundation, Inc. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | /* Return: dest |
20 | |
21 | Inputs: |
22 | in0: dest |
23 | in1: src |
24 | in2: byte count |
25 | |
26 | An assembly implementation of the algorithm used by the generic C |
27 | version from glibc. The case when source and dest are aligned is |
28 | treated separately, for extra performance. |
29 | |
30 | In this form, memcpy assumes little endian mode. For big endian mode, |
31 | sh1 must be computed using an extra instruction: sub sh1 = 64, sh1 |
32 | and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the |
33 | shrp instruction. */ |
34 | |
35 | #define USE_LFETCH |
36 | #define USE_FLP |
37 | #include <sysdep.h> |
38 | #undef ret |
39 | |
40 | #define LFETCH_DIST 500 |
41 | |
42 | #define ALIGN_UNROLL_no 4 // 8-byte words copied per pass of the aligned loop (.l1) |
43 | #define ALIGN_UNROLL_sh 2 // log2 (ALIGN_UNROLL_no): shift turning a word count into a .l1 pass count |
44 | |
45 | #define MEMLAT 8 |
46 | #define Nrot ((4*(MEMLAT+2) + 7) & ~7) |
47 | |
48 | #define OP_T_THRES 16 |
49 | #define OPSIZ 8 |
50 | |
51 | #define loopcnt r14 |
52 | #define elemcnt r15 |
53 | #define saved_pr r16 |
54 | #define saved_lc r17 |
55 | #define adest r18 |
56 | #define dest r19 |
57 | #define asrc r20 |
58 | #define src r21 |
59 | #define len r22 |
60 | #define tmp2 r23 |
61 | #define tmp3 r24 |
62 | #define tmp4 r25 |
63 | #define ptable r26 |
64 | #define ploop56 r27 |
65 | #define loopaddr r28 |
66 | #define sh1 r29 |
67 | #define ptr1 r30 |
68 | #define ptr2 r31 |
69 | |
70 | #define movi0 mov |
71 | |
72 | #define p_scr p6 |
73 | #define p_xtr p7 |
74 | #define p_nxtr p8 |
75 | #define p_few p9 |
76 | |
77 | #if defined(USE_FLP) |
78 | #define load ldf8 |
79 | #define store stf8 |
80 | #define tempreg f6 |
81 | #define the_r fr |
82 | #define the_s fs |
83 | #define the_t ft |
84 | #define the_q fq |
85 | #define the_w fw |
86 | #define the_x fx |
87 | #define the_y fy |
88 | #define the_z fz |
89 | #elif defined(USE_INT) |
90 | #define load ld8 |
91 | #define store st8 |
92 | #define tempreg tmp2 |
93 | #define the_r r |
94 | #define the_s s |
95 | #define the_t t |
96 | #define the_q q |
97 | #define the_w w |
98 | #define the_x x |
99 | #define the_y y |
100 | #define the_z z |
101 | #endif |
102 | |
103 | #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO |
104 | /* Manually force proper loop-alignment: in this branch ALIGN(n) ignores n |
105 | and emits a { nop 0 } bundle instead of .align. Note: be sure to |
106 | double-check the code-layout after making any changes to this routine! */ |
107 | # define ALIGN(n) { nop 0 } |
108 | #else |
109 | # define ALIGN(n) .align n |
110 | #endif |
111 | |
112 | #if defined(USE_LFETCH) |
113 | #define LOOP(shift) \ |
114 | ALIGN(32); \ |
115 | .loop##shift##: \ |
116 | { .mmb \ |
117 | (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \ |
118 | (p[0]) lfetch.nt1 [ptr1], 16 ; \ |
119 | nop.b 0 ; \ |
120 | } { .mib \ |
121 | (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \ |
122 | (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \ |
123 | nop.b 0 ;; \ |
124 | } { .mmb \ |
125 | (p[0]) ld8.nt1 s[0] = [asrc], 8 ; \ |
126 | (p[0]) lfetch.nt1 [ptr2], 16 ; \ |
127 | nop.b 0 ; \ |
128 | } { .mib \ |
129 | (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \ |
130 | (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \ |
131 | br.ctop.sptk.many .loop##shift \ |
132 | ;; } \ |
133 | { .mib \ |
134 | br.cond.sptk.many .copy_bytes ; /* reached once br.ctop is no longer taken: the bytes left over are copied one at a time at .copy_bytes */ \ |
135 | } |
136 | #else |
137 | #define LOOP(shift) \ |
138 | ALIGN(32); \ |
139 | .loop##shift##: \ |
140 | { .mmb \ |
141 | (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \ |
142 | nop.b 0 ; \ |
143 | } { .mib \ |
144 | (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \ |
145 | (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \ |
146 | nop.b 0 ;; \ |
147 | } { .mmb \ |
148 | (p[0]) ld8.nt1 s[0] = [asrc], 8 ; \ |
149 | nop.b 0 ; \ |
150 | } { .mib \ |
151 | (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \ |
152 | (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \ |
153 | br.ctop.sptk.many .loop##shift \ |
154 | ;; } \ |
155 | { .mib \ |
156 | br.cond.sptk.many .copy_bytes ; /* reached once br.ctop is no longer taken: the bytes left over are copied one at a time at .copy_bytes */ \ |
157 | } |
158 | #endif |
159 | |
160 | |
161 | ENTRY(memcpy) |
162 | { .mmi |
163 | .prologue |
164 | alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot |
165 | .rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1] |
166 | .rotp p[MEMLAT+2] |
167 | .rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1] |
168 | mov ret0 = in0 // return value = the original dest pointer |
169 | .save pr, saved_pr |
170 | movi0 saved_pr = pr // save the predicate registers (restored at .restore_and_exit) |
171 | } { .mmi |
172 | and tmp4 = 7, in0 // tmp4 = dest % 8 (0 means dest is 8-byte aligned) |
173 | mov dest = in0 // working copy of the destination pointer |
174 | mov src = in1 // working copy of the source pointer |
175 | ;; } |
176 | { .mii |
177 | cmp.eq p_scr, p0 = in2, r0 // p_scr = (len == 0) |
178 | .save ar.lc, saved_lc |
179 | movi0 saved_lc = ar.lc // save the loop counter (restored at .restore_and_exit) |
180 | .body |
181 | cmp.ge p_few, p0 = OP_T_THRES, in2 // p_few = (len <= OP_T_THRES) |
182 | } { .mbb |
183 | mov len = in2 // working copy of the byte count |
184 | (p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: nothing to copy, return dest |
185 | (p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: short copy, go byte by byte |
186 | ;; } |
187 | { .mmi |
188 | #if defined(USE_LFETCH) |
189 | lfetch.nt1 [dest] // prefetch at the start of the destination |
190 | lfetch.nt1 [src] // prefetch at the start of the source |
191 | #endif |
192 | shr.u elemcnt = len, 3 // elemcnt = len / 8 (recomputed at .dest_aligned) |
193 | } { .mib |
194 | cmp.eq p_scr, p0 = tmp4, r0 // p_scr = (dest % 8 == 0): is destination aligned? |
195 | sub loopcnt = 7, tmp4 // loopcnt = 7 - dest % 8 = (bytes needed to align dest) - 1 |
196 | (p_scr) br.cond.dptk.many .dest_aligned |
197 | ;; } |
198 | { .mmi |
199 | ld1 tmp2 = [src], 1 // preload the first byte for .l0 |
200 | sub len = len, loopcnt, 1 // reduce len by loopcnt + 1, the number of bytes .l0 copies |
201 | movi0 ar.lc = loopcnt // .l0 runs loopcnt + 1 times |
202 | } { .mib |
203 | cmp.ne p_scr, p0 = 0, loopcnt // p_scr = (loopcnt != 0): avoid loading beyond end-point |
204 | ;; } |
205 | |
206 | .l0: // ---------------------------- // L0: Copy single bytes until dest is on an 8-byte boundary |
207 | { .mmi |
208 | st1 [dest] = tmp2, 1 // store the byte loaded on the previous pass |
209 | (p_scr) ld1 tmp2 = [src], 1 // fetch the next byte, unless this was the last one |
210 | } { .mib |
211 | cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point: p_scr for the next pass = (1 < loopcnt) |
212 | add loopcnt = -1, loopcnt |
213 | br.cloop.dptk.few .l0 // repeat while ar.lc != 0 |
214 | ;; } |
215 | |
216 | .dest_aligned: |
217 | { .mmi |
218 | and tmp4 = 7, src // tmp4 = src % 8, ready for alignment check |
219 | shr.u elemcnt = len, 3 // elemcnt = len / 8 (len may have been reduced by .l0) |
220 | ;; } |
221 | { .mib |
222 | cmp.ne p_scr, p0 = tmp4, r0 // p_scr = (src % 8 != 0): source is not 8-byte aligned |
223 | tbit.nz p_xtr, p_nxtr = src, 3 // p_xtr = bit 3 of src: prepare a separate move if src |
224 | } { .mib // is not 16B aligned; p_nxtr is the complement |
225 | add ptr2 = LFETCH_DIST, dest // dest prefetch address, LFETCH_DIST bytes ahead |
226 | add ptr1 = LFETCH_DIST, src |
227 | (p_scr) br.cond.dptk.many .src_not_aligned |
228 | ;; } |
229 | |
230 | // The optimal case: dest and src are both 8-byte aligned |
231 | |
232 | .both_aligned: |
233 | { .mmi |
234 | .pred.rel "mutex" ,p_xtr,p_nxtr |
235 | (p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // p_scr = (elemcnt < N + 1): need N + 1 words when one is copied separately first |
236 | (p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // p_scr = (elemcnt < N): need only N words otherwise |
237 | movi0 pr.rot = 1 << 16 // set rotating predicates: only p16 (p[0]) starts out true |
238 | } { .mib |
239 | (p_scr) br.cond.dpnt.many .copy_full_words |
240 | ;; } |
241 | |
242 | { .mmi |
243 | (p_xtr) load tempreg = [src], 8 |
244 | (p_xtr) add elemcnt = -1, elemcnt |
245 | movi0 ar.ec = MEMLAT + 1 // set the epilog counter (the stores of .l1 run at stage p[MEMLAT]) |
246 | ;; } |
247 | { .mmi |
248 | (p_xtr) add len = -8, len // account for the extra word in len |
249 | add asrc = 16, src // second source pointer, one bank apart (read by .l1 only for USE_INT) |
250 | shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // loopcnt = elemcnt / ALIGN_UNROLL_no: cater for unrolling |
251 | ;;} |
252 | { .mmi |
253 | add loopcnt = -1, loopcnt |
254 | (p_xtr) store [dest] = tempreg, 8 // copy the "extra" word, after which src is 16-byte aligned |
255 | nop.i 0 |
256 | ;; } |
257 | { .mib |
258 | add adest = 16, dest |
259 | movi0 ar.lc = loopcnt // set the loop counter: number of .l1 passes minus one |
260 | ;; } |
261 | |
262 | #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO |
263 | { nop 0 } |
264 | #else |
265 | .align 32 |
266 | #endif |
267 | #if defined(USE_FLP) |
268 | .l1: // ------------------------------- // L1: Pipelined copy of 32 bytes (four 8-byte words) per pass, two words per ldfp8 |
269 | { .mmi |
270 | #if defined(USE_LFETCH) |
271 | (p[0]) lfetch.nt1 [ptr2],32 |
272 | #endif |
273 | (p[0]) ldfp8 the_r[0],the_q[0] = [src], 16 |
274 | (p[0]) add len = -32, len |
275 | } {.mmb |
276 | (p[MEMLAT]) store [dest] = the_r[MEMLAT], 8 |
277 | (p[MEMLAT]) store [adest] = the_s[MEMLAT], 8 |
278 | ;; } |
279 | { .mmi |
280 | #if defined(USE_LFETCH) |
281 | (p[0]) lfetch.nt1 [ptr1],32 |
282 | #endif |
283 | (p[0]) ldfp8 the_s[0], the_t[0] = [src], 16 |
284 | } {.mmb |
285 | (p[MEMLAT]) store [dest] = the_q[MEMLAT], 24 |
286 | (p[MEMLAT]) store [adest] = the_t[MEMLAT], 24 |
287 | br.ctop.dptk.many .l1 |
288 | ;; } |
289 | #elif defined(USE_INT) |
290 | .l1: // ------------------------------- // L1: Pipelined copy of 32 bytes (four 8-byte words) per pass, via src and asrc |
291 | { .mmi |
292 | (p[0]) load the_r[0] = [src], 8 |
293 | (p[0]) load the_q[0] = [asrc], 8 |
294 | (p[0]) add len = -32, len |
295 | } {.mmb |
296 | (p[MEMLAT]) store [dest] = the_r[MEMLAT], 8 |
297 | (p[MEMLAT]) store [adest] = the_q[MEMLAT], 8 |
298 | ;; } |
299 | { .mmi |
300 | (p[0]) load the_s[0] = [src], 24 |
301 | (p[0]) load the_t[0] = [asrc], 24 |
302 | } {.mmb |
303 | (p[MEMLAT]) store [dest] = the_s[MEMLAT], 24 |
304 | (p[MEMLAT]) store [adest] = the_t[MEMLAT], 24 |
305 | #if defined(USE_LFETCH) |
306 | ;; } |
307 | { .mmb |
308 | (p[0]) lfetch.nt1 [ptr2],32 |
309 | (p[0]) lfetch.nt1 [ptr1],32 |
310 | #endif |
311 | br.ctop.dptk.many .l1 |
312 | ;; } |
313 | #endif |
314 | |
315 | .copy_full_words: |
316 | { .mib |
317 | cmp.gt p_scr, p0 = 8, len // p_scr = (len < 8): no full word left |
318 | shr.u elemcnt = len, 3 // elemcnt = len / 8, the number of full words left |
319 | (p_scr) br.cond.dpnt.many .copy_bytes |
320 | ;; } |
321 | { .mii |
322 | load tempreg = [src], 8 |
323 | add loopcnt = -1, elemcnt // loopcnt = elemcnt - 1 |
324 | ;; } |
325 | { .mii |
326 | cmp.ne p_scr, p0 = 0, loopcnt // p_scr = (loopcnt != 0): a second word must be loaded |
327 | mov ar.lc = loopcnt // .l2 runs loopcnt + 1 times |
328 | ;; } |
329 | |
330 | .l2: // ------------------------------- // L2: Leftover full words (max 4), copied one at a time |
331 | { .mmi |
332 | store [dest] = tempreg, 8 |
333 | (p_scr) load tempreg = [src], 8 // fetch the next word, unless this was the last one |
334 | add len = -8, len |
335 | } { .mib |
336 | cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point: p_scr for the next pass = (1 < loopcnt) |
337 | add loopcnt = -1, loopcnt |
338 | br.cloop.dptk.few .l2 |
339 | ;; } |
340 | |
341 | .copy_bytes: |
342 | { .mib |
343 | cmp.eq p_scr, p0 = len, r0 // p_scr = (len == 0): nothing left to copy |
344 | add loopcnt = -1, len // loopcnt = len - 1 (len itself is left unchanged) |
345 | (p_scr) br.cond.spnt .restore_and_exit |
346 | ;; } |
347 | { .mii |
348 | ld1 tmp2 = [src], 1 |
349 | movi0 ar.lc = loopcnt |
350 | cmp.ne p_scr, p0 = 0, loopcnt // p_scr = (loopcnt != 0): avoid load beyond end-point |
351 | ;; } |
352 | |
353 | .l3: // ------------------------------- // L3: Final byte move, one byte per pass |
354 | { .mmi |
355 | st1 [dest] = tmp2, 1 |
356 | (p_scr) ld1 tmp2 = [src], 1 |
357 | } { .mib |
358 | cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point: p_scr for the next pass = (1 < loopcnt) |
359 | add loopcnt = -1, loopcnt |
360 | br.cloop.dptk.few .l3 |
361 | ;; } |
362 | |
363 | .restore_and_exit: |
364 | { .mmi |
365 | movi0 pr = saved_pr, -1 // restore the predicate registers (mask -1) from the copy saved on entry |
366 | ;; } |
367 | { .mib |
368 | movi0 ar.lc = saved_lc // restore the loop counter saved on entry |
369 | br.ret.sptk.many b0 |
370 | ;; } |
371 | |
372 | |
373 | .src_not_aligned: |
374 | { .mmi |
375 | cmp.gt p_scr, p0 = 16, len |
376 | and sh1 = 7, src // sh1 = src % 8 (non-zero on this path) |
377 | shr.u loopcnt = len, 4 // loopcnt = len / 16: each pass of the loop stores 16 bytes |
378 | } { .mib |
379 | add tmp4 = @ltoff(.table), gp |
380 | add tmp3 = @ltoff(.loop56), gp |
381 | (p_scr) br.cond.dpnt.many .copy_bytes // p_scr = (len < 16): do byte by byte if too few |
382 | ;; } |
383 | { .mmi |
384 | and asrc = -8, src // asrc = src & (-8) -- src rounded down to an 8-byte boundary for the loop |
385 | add loopcnt = -1, loopcnt // loopcnt-- (ar.lc takes the pass count minus one) |
386 | shl sh1 = sh1, 3 // sh1 = 8 * (src % 8): byte offset of the matching .table entry |
387 | } { .mmi |
388 | ld8 ptable = [tmp4] // ptable = &table |
389 | ld8 ploop56 = [tmp3] // ploop56 = &loop56 |
390 | and tmp2 = -16, len // tmp2 = len & (-16): bytes handled by the loop |
391 | ;; } |
392 | { .mmi |
393 | add tmp3 = ptable, sh1 // tmp3 = &table + sh1 = address of entry src % 8 |
394 | add src = src, tmp2 // src += len & (-16): src now points at the tail left for .copy_bytes |
395 | movi0 ar.lc = loopcnt // set LC |
396 | ;; } |
397 | { .mmi |
398 | ld8 tmp4 = [tmp3] // tmp4 = loop offset: distance from the selected loop to .loop56 |
399 | sub len = len, tmp2 // len -= len & (-16): len = bytes left for .copy_bytes |
400 | movi0 ar.ec = MEMLAT + 2 // one more pass needed: the stores run at stage p[MEMLAT+1] |
401 | ;; } |
402 | { .mmi |
403 | ld8 s[1] = [asrc], 8 // preload the aligned word that contains the first source byte |
404 | sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset = address of the selected loop |
405 | movi0 pr.rot = 1 << 16 // set rotating predicates: only p16 (p[0]) starts out true |
406 | ;; } |
407 | { .mib |
408 | nop.m 0 |
409 | movi0 b6 = loopaddr |
410 | br b6 // jump to the appropriate loop (.loop8 ... .loop56) |
411 | ;; } |
412 | |
413 | LOOP(8) |
414 | LOOP(16) |
415 | LOOP(24) |
416 | LOOP(32) |
417 | LOOP(40) |
418 | LOOP(48) |
419 | LOOP(56) |
420 | END(memcpy) |
421 | libc_hidden_builtin_def (memcpy) |
422 | |
423 | .rodata |
424 | .align 8 |
425 | .table: |
426 | data8 0 // dummy entry: index 0 (src % 8 == 0) never takes the .src_not_aligned path |
427 | data8 .loop56 - .loop8 |
428 | data8 .loop56 - .loop16 |
429 | data8 .loop56 - .loop24 |
430 | data8 .loop56 - .loop32 |
431 | data8 .loop56 - .loop40 |
432 | data8 .loop56 - .loop48 |
433 | data8 .loop56 - .loop56 |
434 | |