1/* Optimized version of the standard memmove() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000-2022 Free Software Foundation, Inc.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19/* Return: dest
20
21 Inputs:
22 in0: dest
23 in1: src
24 in2: byte count
25
26 The core of the function is the memcpy implementation used in memcpy.S.
27 When bytes have to be copied backwards, only the easy case, when
28 all arguments are multiples of 8, is optimised.
29
30 In this form, it assumes little endian mode. For big endian mode,
31 sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
32 or the UM.be bit should be cleared at the beginning and set at the end. */
33
34#include <sysdep.h>
/* NOTE(review): presumably <sysdep.h> defines a macro named ret that would
   otherwise rewrite the br.ret mnemonic used below -- confirm against
   the header.  */
35#undef ret
36
/* Byte-count threshold tested at .next: when the operands are not all
   multiples of 8 and len is at most this value, the whole block is copied
   by the byte loop at .cpyfew.  */
37#define OP_T_THRES 16
/* Word size in bytes.  Within this file it is referenced only from
   comments.  */
38#define OPSIZ 8
39
/* Symbolic names for the general registers used by memmove:
     adest     second destination pointer, one word ahead of dest (loop .l0)
     saved_pr  copy of pr taken on entry, restored before returning
     saved_lc  copy of ar.lc taken on entry, restored before returning
     dest      working destination pointer (copy of in0)
     src       working source pointer (copy of in1)
     len       remaining byte count (copy of in2)
     asrc      second source pointer in .l0; src rounded down to a multiple
               of 8 in the shifting loops generated by LOOP()
     tmp2-4    scratch; tmp4 holds (dest | src | len) & 7 until it is
               reused at .dest_aligned
     ptable    address of .table
     ploop56   address of .loop56
     loopaddr  computed address of the shifting loop to branch to
     sh1       src % 8, later multiplied by 8
     loopcnt   iteration count about to be written to ar.lc
     value     data word or byte in flight outside the rotating registers  */
40#define adest r15
41#define saved_pr r17
42#define saved_lc r18
43#define dest r19
44#define src r20
45#define len r21
46#define asrc r22
47#define tmp2 r23
48#define tmp3 r24
49#define tmp4 r25
50#define ptable r26
51#define ploop56 r27
52#define loopaddr r28
53#define sh1 r29
54#define loopcnt r30
55#define value r31
56
/* ALIGN(n) normally expands to a ".align n" directive.  When
   GAS_ALIGN_BREAKS_UNWIND_INFO is defined it expands to a "{ nop 0 }"
   group instead, so no alignment directive is emitted.  The only user in
   this file is LOOP(), which passes n = 32.  */
57#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
58# define ALIGN(n) { nop 0 }
59#else
60# define ALIGN(n) .align n
61#endif
62
/* LOOP(shift) emits the label .loop<shift> followed by a software-pipelined
   loop used when dest is 8-byte aligned but src is not.  Each pass:
     stage 0          loads the next aligned word through asrc into r[0];
     stage MEMLAT     shrp takes the low 64 bits of the 128-bit pair
                      r[MEMLAT]:r[MEMLAT+1] shifted right by `shift' bits
                      and places them in value;
     stage MEMLAT+1   stores to dest the value that shrp produced on the
                      previous pass.
   r[MEMLAT+1] is the word loaded one pass before r[MEMLAT]; the very first
   one (w0) is preloaded into r[1] by the caller.  When br.ctop falls
   through, control goes to .cpyfew to copy the remaining len bytes.
   The two nop.b instructions only fill slots.  The instances are reached
   through the offsets stored in .table.  */
63#define LOOP(shift) \
64 ALIGN(32); \
65.loop##shift##: \
66(p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \
67(p[MEMLAT+1]) st8 [dest] = value, 8 ; \
68(p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; \
69 nop.b 0 ; \
70 nop.b 0 ; \
71 br.ctop.sptk .loop##shift ; \
72 br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
73
/* MEMLAT is the number of pipeline stages between a load (predicate p[0])
   and the use of the loaded register (predicate p[MEMLAT]) in the loops
   below; presumably it models the memory latency in cycles -- confirm.  */
74#define MEMLAT 21
/* Nrot is the size of the rotating register region requested by alloc:
   2*MEMLAT+3 registers (r[MEMLAT+2] plus q[MEMLAT+1]) rounded up to a
   multiple of 8.  */
75#define Nrot (((2*MEMLAT+3) + 7) & ~7)
76
/* memmove: copy in2 bytes from in1 to in0 and return the original in0.

   Path selection, as implemented below:
     len == 0                        return immediately
     dest <= src, or
     dest >= src + len               .forward (ascending addresses)
         all of dest/src/len % 8 == 0    .l0, two words per pass
         otherwise                       .next: byte loop .l1 until dest is
                                         8-byte aligned, then .l3 (src also
                                         aligned) or one of the .loopN
                                         shifting loops, then .cpyfew for
                                         the tail; short blocks go to
                                         .cpyfew directly
     src < dest < src + len          .backward (descending addresses)
         all of dest/src/len % 8 == 0    .l5, one word per pass
         otherwise                       .bytecopy, one byte per pass

   All br.ctop loops are software pipelined: ar.lc holds the pass count
   minus one, ar.ec the number of stages, and only p[0] (p16) is set on
   entry.  pr and ar.lc are saved on entry and restored on every exit.

   NOTE(review): the direction tests use cmp.le / cmp.lt, which compare the
   addresses as signed values -- confirm this is intended.  */
77ENTRY(memmove)
78 .prologue
// 3 input registers, Nrot - 3 locals, no outputs, Nrot rotating registers.
79 alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
// Names for the rotating general registers (r[], q[]) and rotating
// predicates (p[]) used by the pipelined loops.
80 .rotr r[MEMLAT + 2], q[MEMLAT + 1]
81 .rotp p[MEMLAT + 2]
82 mov ret0 = in0 // return value = dest
83 .save pr, saved_pr
84 mov saved_pr = pr // save the predicate registers
85 .save ar.lc, saved_lc
86 mov saved_lc = ar.lc // save the loop counter
87 .body
88 or tmp3 = in0, in1 ;; // tmp3 = dest | src
89 or tmp3 = tmp3, in2 // tmp3 = dest | src | len
// Working copies of the arguments are kept in dest/src/len; the rest of
// the function never reads in0-in2 again.
90 mov dest = in0 // dest
91 mov src = in1 // src
92 mov len = in2 // len
93 sub tmp2 = r0, in0 // tmp2 = -dest
94 cmp.eq p6, p0 = in2, r0 // if (len == 0)
95(p6) br.cond.spnt .restore_and_exit;;// return dest;
96 and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7
97 cmp.le p6, p0 = dest, src // if dest <= src it's always safe
98(p6) br.cond.spnt .forward // to copy forward
99 add tmp3 = src, len;; // tmp3 = src + len
100 cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len
101(p6) br.cond.spnt .backward // we have to copy backward
102
// Forward copy.  Reached by the branch above or by falling through when
// dest >= src + len.
103.forward:
104 shr.u loopcnt = len, 4 ;; // loopcnt = len / 16
105 cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
106(p6) br.cond.sptk .next // goto next;
107
108// The optimal case, when dest, src and len are all multiples of 8
109
// len is a multiple of 8 here, so len & 0xf is either 0 or 8; in the
// latter case one word is copied up front and the rest is a multiple of 16.
110 and tmp3 = 0xf, len
111 mov pr.rot = 1 << 16 // set rotating predicates
112 mov ar.ec = MEMLAT + 1 ;; // set the epilog counter
113 cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word?
114 adds loopcnt = -1, loopcnt;; // --loopcnt
115(p6) ld8 value = [src], 8;;
116(p6) st8 [dest] = value, 8 // copy the "odd" word
117 mov ar.lc = loopcnt // set the loop counter
// With len == 8 the odd word was the whole block (and loopcnt is -1),
// so skip the loop.
118 cmp.eq p6, p0 = 8, len
119(p6) br.cond.spnt .restore_and_exit;;// the one-word special case
120 adds adest = 8, dest // set adest one word ahead of dest
121 adds asrc = 8, src ;; // set asrc one word ahead of src
122 nop.b 0 // get the "golden" alignment for
123 nop.b 0 // the next loop
// 16 bytes per pass: src/dest handle the even words, asrc/adest the odd
// ones; stores trail the loads by MEMLAT passes.
124.l0:
125(p[0]) ld8 r[0] = [src], 16
126(p[0]) ld8 q[0] = [asrc], 16
127(p[MEMLAT]) st8 [dest] = r[MEMLAT], 16
128(p[MEMLAT]) st8 [adest] = q[MEMLAT], 16
129 br.ctop.dptk .l0 ;;
130
// Same exit sequence as .restore_and_exit, inlined here.
131 mov pr = saved_pr, -1 // restore the predicate registers
132 mov ar.lc = saved_lc // restore the loop counter
133 br.ret.sptk.many b0
// Forward copy when at least one of dest, src, len is not a multiple of 8.
134.next:
135 cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
136 and loopcnt = 7, tmp2 // loopcnt = -dest % 8
137(p6) br.cond.spnt .cpyfew // copy byte by byte
138 ;;
139 cmp.eq p6, p0 = loopcnt, r0
140(p6) br.cond.sptk .dest_aligned
141 sub len = len, loopcnt // len -= -dest % 8
142 adds loopcnt = -1, loopcnt // --loopcnt
143 ;;
144 mov ar.lc = loopcnt
145.l1: // copy -dest % 8 bytes
146 ld1 value = [src], 1 // value = *src++
147 ;;
148 st1 [dest] = value, 1 // *dest++ = value
149 br.cloop.dptk .l1
// dest is now a multiple of 8.  Split len into whole words (loopcnt) and
// a tail of len % 8 bytes that .cpyfew copies afterwards.
150.dest_aligned:
151 and sh1 = 7, src // sh1 = src % 8
152 and tmp2 = -8, len // tmp2 = len & -OPSIZ
153 and asrc = -8, src // asrc = src & -OPSIZ -- align src
154 shr.u loopcnt = len, 3 // loopcnt = len / 8
155 and len = 7, len;; // len = len % 8
156 adds loopcnt = -1, loopcnt // --loopcnt
// tmp4 / tmp3 = addresses of the linkage-table slots that hold the
// addresses of .table and .loop56.
157 addl tmp4 = @ltoff(.table), gp
158 addl tmp3 = @ltoff(.loop56), gp
159 mov ar.ec = MEMLAT + 1 // set EC
160 mov pr.rot = 1 << 16;; // set rotating predicates
161 mov ar.lc = loopcnt // set LC
162 cmp.eq p6, p0 = sh1, r0 // is the src aligned?
163(p6) br.cond.sptk .src_aligned
// src is not 8-byte aligned: the words are read through asrc and shifted.
// src is advanced past all whole words now so that it addresses the tail
// bytes when .cpyfew is reached.
164 add src = src, tmp2 // src += len & -OPSIZ
// 8 * (src % 8) is both the shift count in bits and the byte offset of
// the matching 8-byte entry in .table.
165 shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
166 ld8 ploop56 = [tmp3] // ploop56 = &loop56
167 ld8 ptable = [tmp4];; // ptable = &table
168 add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
// The shifting loops store at stage MEMLAT + 1, hence the extra stage.
169 mov ar.ec = MEMLAT + 1 + 1 // one more pass needed
170 ld8 tmp4 = [tmp3];; // tmp4 = loop offset
171 sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
// Preload the first aligned word; it becomes r[MEMLAT+1] by the time the
// first shrp executes.
172 ld8 r[1] = [asrc], 8;; // w0
173 mov b6 = loopaddr;;
174 br b6 // jump to the appropriate loop
175
// One shifting loop per possible source misalignment (1..7 bytes).
176 LOOP(8)
177 LOOP(16)
178 LOOP(24)
179 LOOP(32)
180 LOOP(40)
181 LOOP(48)
182 LOOP(56)
183
// dest and src both 8-byte aligned: one word per pass, then fall through
// to .cpyfew for the tail.
184.src_aligned:
185.l3:
186(p[0]) ld8 r[0] = [src], 8
187(p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
188 br.ctop.dptk .l3
// Copy the remaining len bytes (possibly none) one at a time, ascending.
189.cpyfew:
190 cmp.eq p6, p0 = len, r0 // is len == 0 ?
191 adds len = -1, len // --len;
192(p6) br.cond.spnt .restore_and_exit ;;
193 mov ar.lc = len
194.l4:
195 ld1 value = [src], 1
196 ;;
197 st1 [dest] = value, 1
198 br.cloop.dptk .l4 ;;
// Common exit: ret0 was set on entry; undo the changes to pr and ar.lc.
199.restore_and_exit:
200 mov pr = saved_pr, -1 // restore the predicate registers
201 mov ar.lc = saved_lc // restore the loop counter
202 br.ret.sptk.many b0
203
204// In the case of a backward copy, optimise only the case when everything
205// is a multiple of 8, otherwise copy byte by byte. The backward copy is
206// used only when the blocks are overlapping and dest > src.
207
208.backward:
209 shr.u loopcnt = len, 3 // loopcnt = len / 8
210 add src = src, len // src points one byte past the end
211 add dest = dest, len ;; // dest points one byte past the end
// ar.ec and pr.rot set here serve both .l5 and .l6.
212 mov ar.ec = MEMLAT + 1 // set the epilog counter
213 mov pr.rot = 1 << 16 // set rotating predicates
214 adds loopcnt = -1, loopcnt // --loopcnt
215 cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
216(p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward
217 adds src = -8, src // src points to the last word
218 adds dest = -8, dest // dest points to the last word
219 mov ar.lc = loopcnt;; // set the loop counter
// One word per pass, from the highest address down.
220.l5:
221(p[0]) ld8 r[0] = [src], -8
222(p[MEMLAT]) st8 [dest] = r[MEMLAT], -8
223 br.ctop.dptk .l5
224 br.cond.sptk .restore_and_exit
225.bytecopy:
226 adds src = -1, src // src points to the last byte
227 adds dest = -1, dest // dest points to the last byte
228 adds loopcnt = -1, len;; // loopcnt = len - 1
229 mov ar.lc = loopcnt;; // set the loop counter
// One byte per pass, from the highest address down.
230.l6:
231(p[0]) ld1 r[0] = [src], -1
232(p[MEMLAT]) st1 [dest] = r[MEMLAT], -1
233 br.ctop.dptk .l6
234 br.cond.sptk .restore_and_exit
235END(memmove)
236
/* Offsets of the shifting loops, measured back from .loop56.  memmove
   indexes this table with src % 8 (as the byte offset 8 * (src % 8)) and
   subtracts the entry from the address of .loop56 to obtain the loop to
   branch to.  Entry 0 is never read: an aligned source takes the
   .src_aligned path before the table is consulted.  */
237 .rodata
238 .align 8
239.table:
240 data8 0 // dummy entry
241 data8 .loop56 - .loop8
242 data8 .loop56 - .loop16
243 data8 .loop56 - .loop24
244 data8 .loop56 - .loop32
245 data8 .loop56 - .loop40
246 data8 .loop56 - .loop48
247 data8 .loop56 - .loop56
248
/* NOTE(review): libc_hidden_builtin_def is not defined in this file;
   presumably it provides the hidden alias glibc uses for internal calls
   to memmove -- confirm against the glibc symbol-visibility headers.  */
249libc_hidden_builtin_def (memmove)
250

source code of glibc/sysdeps/ia64/memmove.S