memmove.S source code [glibc/sysdeps/ia64/memmove.S]

1	/* Optimized version of the standard memmove() function.
2	This file is part of the GNU C Library.
3	Copyright (C) 2000-2022 Free Software Foundation, Inc.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. */
18
19	/* Return: dest
20
21	Inputs:
22	in0: dest
23	in1: src
24	in2: byte count
25
26	The core of the function is the memcpy implementation used in memcpy.S.
27	When bytes have to be copied backwards, only the easy case, when
28	all arguments are multiples of 8, is optimised.
29
30	In this form, it assumes little endian mode. For big endian mode,
31	sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
32	or the UM.be bit should be cleared at the beginning and set at the end. */
33
34	#include <sysdep.h>
35	#undef ret
36
37	#define OP_T_THRES 16 /* forward unaligned path: lengths <= this are copied byte by byte (see .next) */
38	#define OPSIZ 8 /* word size in bytes; only referenced in comments here, the code uses literal 8 */
39
40	#define adest r15 /* second store pointer, kept one word ahead of dest in the aligned loop .l0 */
41	#define saved_pr r17 /* copy of the predicate registers taken on entry */
42	#define saved_lc r18 /* copy of ar.lc taken on entry */
43	#define dest r19 /* running destination pointer (starts as in0) */
44	#define src r20 /* running source pointer (starts as in1) */
45	#define len r21 /* byte count still to be copied (starts as in2) */
46	#define asrc r22 /* second load pointer in .l0; 8-byte-aligned source pointer in the shrp loops */
47	#define tmp2 r23 /* scratch: -dest, later len & -8 */
48	#define tmp3 r24 /* scratch */
49	#define tmp4 r25 /* scratch: (dest | src | len) & 7, later the loop offset read from .table */
50	#define ptable r26 /* address of .table */
51	#define ploop56 r27 /* address of .loop56 */
52	#define loopaddr r28 /* address of the shrp loop selected for the source misalignment */
53	#define sh1 r29 /* src % 8, then 8 * (src % 8) */
54	#define loopcnt r30 /* value to be written into ar.lc (iterations - 1) */
55	#define value r31 /* data word/byte in flight between a load (or shrp) and its store */
56
57	#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO /* build-time switch defined outside this file */
58	# define ALIGN(n) { nop 0 } /* emit a nop bundle in place of the alignment directive; n is ignored */
59	#else
60	# define ALIGN(n) .align n /* plain assembler alignment to n bytes */
61	#endif
62
63	#define LOOP(shift) /* emits label .loop<shift>: pipelined copy loop for a source misaligned by <shift> bits */ \
64	ALIGN(32); \
65	.loop##shift##: \
66	(p[0]) ld8 r[0] = [asrc], 8 ; /* w1: load the next aligned source word */ \
67	(p[MEMLAT+1]) st8 [dest] = value, 8 ; /* store the word produced by shrp one stage earlier */ \
68	(p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; /* merge two consecutive source words: (r[MEMLAT]:r[MEMLAT+1]) >> shift */ \
69	nop.b 0 ; \
70	nop.b 0 ; \
71	br.ctop.sptk .loop##shift ; /* rotate registers and loop while LC/EC allow */ \
72	br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
73
74	#define MEMLAT 21 /* number of pipeline stages between the load (p[0]) and the store (p[MEMLAT]) of a word */
75	#define Nrot (((2*MEMLAT+3) + 7) & ~7) /* rotating register count passed to alloc: r[MEMLAT+2] + q[MEMLAT+1] rounded up to a multiple of 8 */
76
77	ENTRY(memmove)
	// memmove: copy in2 bytes from in1 (src) to in0 (dest); ret0 is set to in0.
	// Direction is chosen from the relative position of the buffers:
	//   dest <= src, or dest >= src + len  -> .forward
	//   src < dest < src + len             -> .backward
	// .forward uses the two-words-per-iteration loop .l0 when dest, src and
	// len are all multiples of 8, otherwise falls to .next (byte copy until
	// dest is aligned, then .l3 or one of the shrp loops, then .cpyfew).
	// .backward uses the word loop .l5 when everything is a multiple of 8,
	// otherwise the byte loop .l6.
	// pr and ar.lc are saved on entry and restored on every exit path.
78	.prologue
79	alloc r2 = ar.pfs, `3`, Nrot - `3`, `0`, Nrot // 3 inputs, Nrot - 3 locals, 0 outputs, Nrot rotating registers
80	.rotr r[MEMLAT + `2`], q[MEMLAT + `1`] // name the rotating general register arrays r[] and q[]
81	.rotp p[MEMLAT + `2`] // name the rotating predicate array p[]
82	mov ret0 = in0 // return value = dest
83	.save pr, saved_pr
84	mov saved_pr = pr // save the predicate registers
85	.save ar.lc, saved_lc
86	mov saved_lc = ar.lc // save the loop counter
87	.body
88	or tmp3 = in0, in1 ;; // tmp3 = dest | src
89	or tmp3 = tmp3, in2 // tmp3 = dest | src | len
90	mov dest = in0 // dest
91	mov src = in1 // src
92	mov len = in2 // len
93	sub tmp2 = r0, in0 // tmp2 = -dest
94	cmp.eq p6, p0 = in2, r0 // if (len == 0)
95	(p6) br.cond.spnt .restore_and_exit;;// return dest;
96	and tmp4 = `7`, tmp3 // tmp4 = (dest | src | len) & 7
97	cmp.le p6, p0 = dest, src // if dest <= src it's always safe
	// NOTE(review): cmp.le here and cmp.lt below are the signed forms, applied
	// to addresses -- confirm that signed comparison is intended.
98	(p6) br.cond.spnt .forward // to copy forward
99	add tmp3 = src, len;; // tmp3 = src + len (one past the end of the source)
100	cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len
101	(p6) br.cond.spnt .backward // we have to copy backward
102
103	.forward: // also reached by fall-through when dest >= src + len
104	shr.u loopcnt = len, `4` ;; // loopcnt = len / 16
105	cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
106	(p6) br.cond.sptk .next // goto next;
107
108	// The optimal case, when dest, src and len are all multiples of 8
109
110	and tmp3 = `0xf`, len // tmp3 = len % 16 (0 or 8 on this path)
111	mov pr.rot = `1` << `16` // set rotating predicates
112	mov ar.ec = MEMLAT + `1` ;; // set the epilog counter
113	cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word?
114	adds loopcnt = -`1`, loopcnt;; // --loopcnt
115	(p6) ld8 value = [src], `8`;;
116	(p6) st8 [dest] = value, `8` // copy the "odd" word
117	mov ar.lc = loopcnt // set the loop counter
118	cmp.eq p6, p0 = `8`, len
119	(p6) br.cond.spnt .restore_and_exit;;// the one-word special case
120	adds adest = `8`, dest // set adest one word ahead of dest
121	adds asrc = `8`, src ;; // set asrc one word ahead of src
122	nop.b `0` // get the "golden" alignment for
123	nop.b `0` // the next loop
124	.l0: // 16 bytes per iteration: (src, dest) and (asrc, adest) each step by 16
125	(p[`0`]) ld8 r[`0`] = [src], `16`
126	(p[`0`]) ld8 q[`0`] = [asrc], `16`
127	(p[MEMLAT]) st8 [dest] = r[MEMLAT], `16` // store the word loaded MEMLAT stages earlier
128	(p[MEMLAT]) st8 [adest] = q[MEMLAT], `16`
129	br.ctop.dptk .l0 ;;
130
131	mov pr = saved_pr, -`1` // restore the predicate registers
132	mov ar.lc = saved_lc // restore the loop counter
133	br.ret.sptk.many b0
134	.next: // forward copy, general (not all multiples of 8) case
135	cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
136	and loopcnt = `7`, tmp2 // loopcnt = -dest % 8
137	(p6) br.cond.spnt .cpyfew // copy byte by byte
138	;;
139	cmp.eq p6, p0 = loopcnt, r0 // dest already 8-byte aligned?
140	(p6) br.cond.sptk .dest_aligned
141	sub len = len, loopcnt // len -= -dest % 8
142	adds loopcnt = -`1`, loopcnt // --loopcnt
143	;;
144	mov ar.lc = loopcnt
145	.l1: // copy -dest % 8 bytes
146	ld1 value = [src], `1` // value = *src++
147	;;
148	st1 [dest] = value, `1` // *dest++ = value
149	br.cloop.dptk .l1
150	.dest_aligned: // dest is now a multiple of 8
151	and sh1 = `7`, src // sh1 = src % 8
152	and tmp2 = -`8`, len // tmp2 = len & -OPSIZ
153	and asrc = -`8`, src // asrc = src & -OPSIZ -- align src
154	shr.u loopcnt = len, `3` // loopcnt = len / 8
155	and len = `7`, len;; // len = len % 8
156	adds loopcnt = -`1`, loopcnt // --loopcnt
157	addl tmp4 = @ltoff(.table), gp // tmp4 = address of the linkage-table slot for .table
158	addl tmp3 = @ltoff(.loop56), gp // tmp3 = address of the linkage-table slot for .loop56
159	mov ar.ec = MEMLAT + `1` // set EC
160	mov pr.rot = `1` << `16`;; // set rotating predicates
161	mov ar.lc = loopcnt // set LC
162	cmp.eq p6, p0 = sh1, r0 // is the src aligned?
163	(p6) br.cond.sptk .src_aligned
164	add src = src, tmp2 // src += len & -OPSIZ (the shrp loops read via asrc; src is used again in .cpyfew)
165	shl sh1 = sh1, `3` // sh1 = 8 * (src % 8)
166	ld8 ploop56 = [tmp3] // ploop56 = &loop56
167	ld8 ptable = [tmp4];; // ptable = &table
168	add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
169	mov ar.ec = MEMLAT + `1` + `1` // one more pass needed
170	ld8 tmp4 = [tmp3];; // tmp4 = loop offset
171	sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
172	ld8 r[`1`] = [asrc], `8`;; // w0
173	mov b6 = loopaddr;;
174	br b6 // jump to the appropriate loop
175
	// One shrp loop per source misalignment: src % 8 == k selects .loop<8*k>.
	// Each loop ends with a branch to .cpyfew.
176	LOOP(`8`)
177	LOOP(`16`)
178	LOOP(`24`)
179	LOOP(`32`)
180	LOOP(`40`)
181	LOOP(`48`)
182	LOOP(`56`)
183
184	.src_aligned: // src and dest both 8-byte aligned: one word per iteration
185	.l3:
186	(p[`0`]) ld8 r[`0`] = [src], `8`
187	(p[MEMLAT]) st8 [dest] = r[MEMLAT], `8`
188	br.ctop.dptk .l3
189	.cpyfew: // copy the remaining len bytes forward, one at a time
190	cmp.eq p6, p0 = len, r0 // is len == 0 ?
191	adds len = -`1`, len // --len;
192	(p6) br.cond.spnt .restore_and_exit ;;
193	mov ar.lc = len // ar.lc = remaining bytes - 1
194	.l4:
195	ld1 value = [src], `1` // value = *src++
196	;;
197	st1 [dest] = value, `1` // *dest++ = value
198	br.cloop.dptk .l4 ;;
199	.restore_and_exit: // common exit: ret0 already holds dest
200	mov pr = saved_pr, -`1` // restore the predicate registers
201	mov ar.lc = saved_lc // restore the loop counter
202	br.ret.sptk.many b0
203
204	// In the case of a backward copy, optimise only the case when everything
205	// is a multiple of 8, otherwise copy byte by byte. The backward copy is
206	// used only when the blocks are overlapping and dest > src.
207
208	.backward:
209	shr.u loopcnt = len, `3` // loopcnt = len / 8
210	add src = src, len // src points one byte past the end
211	add dest = dest, len ;; // dest points one byte past the end
212	mov ar.ec = MEMLAT + `1` // set the epilog counter
213	mov pr.rot = `1` << `16` // set rotating predicates
	// ar.ec and pr.rot are set before the branch below, so they apply to
	// both the word loop .l5 and the byte loop .l6.
214	adds loopcnt = -`1`, loopcnt // --loopcnt
215	cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
216	(p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward
217	adds src = -`8`, src // src points to the last word
218	adds dest = -`8`, dest // dest points to the last word
219	mov ar.lc = loopcnt;; // set the loop counter
220	.l5: // one word per iteration, addresses decreasing
221	(p[`0`]) ld8 r[`0`] = [src], -`8`
222	(p[MEMLAT]) st8 [dest] = r[MEMLAT], -`8`
223	br.ctop.dptk .l5
224	br.cond.sptk .restore_and_exit
225	.bytecopy:
226	adds src = -`1`, src // src points to the last byte
227	adds dest = -`1`, dest // dest points to the last byte
228	adds loopcnt = -`1`, len;; // loopcnt = len - 1
229	mov ar.lc = loopcnt;; // set the loop counter
230	.l6: // one byte per iteration, addresses decreasing
231	(p[`0`]) ld1 r[`0`] = [src], -`1`
232	(p[MEMLAT]) st1 [dest] = r[MEMLAT], -`1`
233	br.ctop.dptk .l6
234	br.cond.sptk .restore_and_exit
235	END(memmove)
236
237	.rodata
238	.align `8`
239	.table: // distance of each shrp loop from .loop56; memmove indexes it with 8 * (src % 8)
240	data8 `0` // dummy entry
241	data8 .loop56 - .loop8 // src % 8 == 1
242	data8 .loop56 - .loop16 // src % 8 == 2
243	data8 .loop56 - .loop24 // src % 8 == 3
244	data8 .loop56 - .loop32 // src % 8 == 4
245	data8 .loop56 - .loop40 // src % 8 == 5
246	data8 .loop56 - .loop48 // src % 8 == 6
247	data8 .loop56 - .loop56 // src % 8 == 7
248
249	libc_hidden_builtin_def (memmove) /* macro defined outside this file; presumably sets up the library-internal hidden alias for memmove -- confirm against its definition */
250

source code of glibc/sysdeps/ia64/memmove.S