bzero.S source code [glibc/sysdeps/ia64/bzero.S]

1	/* Optimized version of the standard bzero() function.
2	This file is part of the GNU C Library.
3	Copyright (C) 2000-2022 Free Software Foundation, Inc.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. */
18
19	/* Return: dest
20
21	Inputs:
22	in0: dest
23	in1: count
24
25	The algorithm is fairly straightforward: set byte by byte until we
26	get to a 16B-aligned address, then loop on 128 B chunks using an
27	early store as prefetching, then loop on 32B chunks, then clear remaining
28	words, finally clear remaining bytes.
29	Since a stf.spill f0 can store 16B in one go, we use this instruction
30	to get peak speed. */
31
32	#include <sysdep.h>
33	#undef ret
34
// Symbolic names for the two input registers of bzero.
35	#define dest in0
36	#define cnt in1
37
// General registers r21-r31 used as temporaries:
//   tmp      - short-lived scratch value
//   save_lc  - copy of ar.lc taken on entry
//   ptr0..3  - store pointers; ptr9 is the store-ahead pointer of the
//              128-byte line loop
//   loopcnt  - value loaded into ar.lc for the counted loops
//   linecnt  - number of whole LINE_SIZE lines (cnt >> LSIZE_SH)
//   bytecnt  - bytes between dest and the next (MIN1+1)-byte boundary
38	#define tmp r31
39	#define save_lc r30
40	#define ptr0 r29
41	#define ptr1 r28
42	#define ptr2 r27
43	#define ptr3 r26
44	#define ptr9 r24
45	#define loopcnt r23
46	#define linecnt r22
47	#define bytecnt r21
48
49	// This routine uses only scratch predicate registers (p6 - p15)
50	#define p_scr p6 // default register for same-cycle branches
// p_unalgn: dest is not (MIN1+1)-byte aligned.
// p_y/p_n and p_yy/p_nn: yes/no predicate pairs reused for successive
// "do we need a store of this size?" decisions.
51	#define p_unalgn p9
52	#define p_y p11
53	#define p_n p12
54	#define p_yy p13
55	#define p_nn p14
56
// Moves to/from ar.lc are written through this alias; here it is plain mov.
57	#define movi0 mov
58
// MIN1        - mask of the low address bits; MIN1+1 (16) is the alignment
//               reached before the bulk loops start
// MIN1P1HALF  - half of MIN1+1, offset of the optional 8-byte head store
// LINE_SIZE   - bytes handled per iteration of the line loop
// LSIZE_SH    - shift that converts a byte count into a LINE_SIZE count
// PREF_AHEAD  - maximum number of lines touched ahead of the fill stores
59	#define MIN1 15
60	#define MIN1P1HALF 8
61	#define LINE_SIZE 128
62	#define LSIZE_SH 7 // shift amount
63	#define PREF_AHEAD 8
64
// Select the 8-byte store used by the 32-byte loop and the word tail:
// USE_INT -> st8 of r0, USE_FLP -> stf8 of f0.  USE_FLP is defined here.
65	#define USE_FLP
66	#if defined(USE_INT)
67	#define store st8
68	#define myval r0
69	#elif defined(USE_FLP)
70	#define store stf8
71	#define myval f0
72	#endif
73
// bzero entry: zero cnt (in1) bytes starting at dest (in0); ret0 is set to
// dest.  This part returns at once for cnt == 0, sends counts below 16 to
// .move_bytes_unaligned, and otherwise clears the bytes in front of the
// next 16-byte boundary before choosing a bulk loop.
74	.align `64`
75	ENTRY(bzero)
76	{ .mmi
77	.prologue
78	alloc tmp = ar.pfs, `2`, `0`, `0`, `0`
79	lfetch.nt1 [dest]
80	.save ar.lc, save_lc
// Keep the incoming ar.lc; the exit paths copy save_lc back into ar.lc.
81	movi0 save_lc = ar.lc
82	} { .mmi
83	.body
84	mov ret0 = dest // return value
85	nop.m `0`
// p_scr = (cnt == 0)
86	cmp.eq p_scr, p0 = cnt, r0
87	;; }
88	{ .mmi
// ptr2 = dest rounded down to a 16-byte boundary, tmp = dest & 15.
89	and ptr2 = -(MIN1+`1`), dest // aligned address
90	and tmp = MIN1, dest // prepare to check for alignment
// p_y/p_n from bit 0 of dest are consumed by .move_bytes_unaligned.
91	tbit.nz p_y, p_n = dest, `0` // Do we have an odd address? (M_B_U)
92	} { .mib
93	mov ptr1 = dest
94	nop.i `0`
// ar.lc has not been written yet, so no restore is needed on this return.
95	(p_scr) br.ret.dpnt.many rp // return immediately if count = 0
96	;; }
97	{ .mib
// p_unalgn = dest has non-zero low four bits.
98	cmp.ne p_unalgn, p0 = tmp, r0
99	} { .mib // NB: # of bytes to move is 1
// bytecnt = 16 - (dest & 15): bytes up to the next 16-byte boundary.
100	sub bytecnt = (MIN1+`1`), tmp // higher than loopcnt
101	cmp.gt p_scr, p0 = `16`, cnt // is it a minimalistic task?
102	(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
103	;; }
// Head alignment (only when p_unalgn): ptr1 becomes the first 16-byte
// aligned address above dest, and ptr2 walks downwards from that boundary.
// Bits 3..0 of bytecnt select an st8, st4, st2 and st1 in turn; each taken
// store post-decrements ptr2 to the slot of the next smaller store, each
// skipped size instead advances ptr2 so the smaller stores stay adjacent
// to the boundary.  cnt is reduced by the size of every store performed.
104	{ .mmi
105	(p_unalgn) add ptr1 = (MIN1+`1`), ptr2 // after alignment
106	(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment
107	(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, `3` // should we do a st8 ?
108	;; }
109	{ .mib
110	(p_y) add cnt = -`8`, cnt
111	(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, `2` // should we do a st4 ?
112	} { .mib
113	(p_y) st8 [ptr2] = r0,-`4`
114	(p_n) add ptr2 = `4`, ptr2
115	;; }
116	{ .mib
117	(p_yy) add cnt = -`4`, cnt
118	(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, `1` // should we do a st2 ?
119	} { .mib
120	(p_yy) st4 [ptr2] = r0,-`2`
121	(p_nn) add ptr2 = `2`, ptr2
122	;; }
123	{ .mmi
124	mov tmp = LINE_SIZE+`1` // for compare
125	(p_y) add cnt = -`2`, cnt
126	(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, `0` // should we do a st1 ?
127	} { .mmi
128	nop.m `0`
129	(p_y) st2 [ptr2] = r0,-`1`
130	(p_n) add ptr2 = `1`, ptr2
131	;; }
132
// NOTE(review): the compare below sits in the same instruction group as the
// final "add cnt = -1", so it presumably sees cnt before that decrement,
// which would explain comparing against LINE_SIZE+1 - confirm.
133	{ .mmi
134	(p_yy) st1 [ptr2] = r0
135	cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task?
136	} { .mbb
137	(p_yy) add cnt = -`1`, cnt
138	(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
139	;; }
// linecnt = number of whole LINE_SIZE lines left; falls through to .l1b.
140	{ .mib
141	nop.m `0`
142	shr.u linecnt = cnt, LSIZE_SH
143	nop.b `0`
144	;; }
145
// Whole-line loop.  On entry ptr1 is the current (16-byte aligned) store
// address, cnt the bytes left and linecnt = cnt >> LSIZE_SH.  Each line is
// cleared with eight 16-byte stf.spill stores of f0: the store at line
// offset 0 is issued ahead of time through ptr9, the other seven (offsets
// 16..112) are issued by ptr2/ptr0 in .l1bx.
146	.align `32`
147	.l1b: // ------------------// L1B: store ahead into cache lines; fill later
148	{ .mmi
// tmp = bytes covered by whole lines, cnt = bytes left after those lines.
149	and tmp = -(LINE_SIZE), cnt // compute end of range
150	mov ptr9 = ptr1 // used for prefetching
151	and cnt = (LINE_SIZE-`1`), cnt // remainder
152	} { .mmi
// Store-ahead loop count: PREF_AHEAD-1, or linecnt-1 when fewer than
// PREF_AHEAD lines exist (ar.lc holds iterations minus one).
153	mov loopcnt = PREF_AHEAD-`1` // default prefetch loop
154	cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
155	;; }
156	{ .mmi
157	(p_scr) add loopcnt = -`1`, linecnt
158	add ptr2 = `16`, ptr1 // start of stores (beyond prefetch stores)
159	add ptr1 = tmp, ptr1 // first address beyond total range
160	;; }
161	{ .mmi
162	add tmp = -`1`, linecnt // next loop count
163	movi0 ar.lc = loopcnt
164	;; }
// Write the first 16 bytes of each of the leading lines.
165	.pref_l1b:
166	{ .mib
167	stf.spill [ptr9] = f0, `128` // Do stores one cache line apart
168	nop.i `0`
169	br.cloop.dptk.few .pref_l1b
170	;; }
171	{ .mmi
// ptr0 = ptr2 + 16; ar.lc = linecnt - 1 for the fill loop.
172	add ptr0 = `16`, ptr2 // Two stores in parallel
173	movi0 ar.lc = tmp
174	;; }
// One iteration fills one line: ptr2 writes offsets 16, 48, 80 and 112,
// ptr0 writes offsets 32, 64 and 96 and then skips 64 bytes so that it
// lands on offset 32 of the following line.
175	.l1bx:
176	{ .mmi
177	stf.spill [ptr2] = f0, `32`
178	stf.spill [ptr0] = f0, `32`
179	;; }
180	{ .mmi
181	stf.spill [ptr2] = f0, `32`
182	stf.spill [ptr0] = f0, `32`
183	;; }
184	{ .mmi
185	stf.spill [ptr2] = f0, `32`
186	stf.spill [ptr0] = f0, `64`
// Keep storing ahead only while ptr9 is still below the end of the
// whole-line range held in ptr1.
187	cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
188	;; }
189	{ .mmb
190	stf.spill [ptr2] = f0, `32`
191	(p_scr) stf.spill [ptr9] = f0, `128`
192	br.cloop.dptk.few .l1bx
193	;; }
// With fewer than 8 bytes left go straight to the byte tail; otherwise
// fall through to .fraction_of_line.
194	{ .mib
195	cmp.gt p_scr, p0 = `8`, cnt // just a few bytes left ?
196	(p_scr) br.cond.dpnt.many .move_bytes_from_alignment
197	;; }
198
// Clears the part of the remainder that consists of whole 32-byte chunks.
// ptr1 is the current store address and cnt the bytes left.
199	.fraction_of_line:
200	{ .mib
// ptr2 runs 16 bytes ahead of ptr1 so the two pointers split each chunk.
201	add ptr2 = `16`, ptr1
202	shr.u loopcnt = cnt, `5` // loopcnt = cnt / 32
203	;; }
204	{ .mib
// No whole 32-byte chunk (cnt < 32): skip the loop.  Otherwise ar.lc is
// loaded with the chunk count minus one.
205	cmp.eq p_scr, p0 = loopcnt, r0
206	add loopcnt = -`1`, loopcnt
207	(p_scr) br.cond.dpnt.many .store_words
208	;; }
209	{ .mib
210	and cnt = `0x1f`, cnt // compute the remaining cnt
211	movi0 ar.lc = loopcnt
212	;; }
213	.align `32`
// Per iteration: ptr1 writes bytes 0..15 and ptr2 bytes 16..31 of the
// chunk as two 8-byte stores each; the post-increments 8 + 24 advance both
// pointers by 32.  Falls through to .store_words when the loop ends.
214	.l2: // -----------------------------// L2A: store 32B in 2 cycles
215	{ .mmb
216	store [ptr1] = myval, `8`
217	store [ptr2] = myval, `8`
218	;; } { .mmb
219	store [ptr1] = myval, `24`
220	store [ptr2] = myval, `24`
221	br.cloop.dptk.many .l2
222	;; }
// Clears the remaining whole 8-byte words through ptr1.  With fewer than
// 8 bytes left it branches to the byte tail; otherwise it performs one
// unconditional 8-byte store followed by up to two predicated ones,
// subtracting 8 from cnt for every store done, and then falls through to
// .move_bytes_from_alignment.
223	.store_words:
224	{ .mib
225	cmp.gt p_scr, p0 = `8`, cnt // just a few bytes left ?
226	(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
227	;; }
228
229	{ .mmi
230	store [ptr1] = myval, `8` // store
// p_y: a second word is needed (compare and subtract share this group).
231	cmp.le p_y, p_n = `16`, cnt //
232	add cnt = -`8`, cnt // subtract
233	;; }
234	{ .mmi
235	(p_y) store [ptr1] = myval, `8` // store
// p_yy: a third word is needed; cleared by .unc when p_y is false.
236	(p_y) cmp.le.unc p_yy, p_nn = `16`, cnt
237	(p_y) add cnt = -`8`, cnt // subtract
238	;; }
239	{ .mmi // store
240	(p_yy) store [ptr1] = myval, `8`
241	(p_yy) add cnt = -`8`, cnt // subtract
242	;; }
243
// Byte tail: clears the last cnt bytes at ptr1.  Only bits 2, 1 and 0 of
// cnt are examined (the branches to this label in this file are taken when
// 8 > cnt), selecting an st4, st2 and st1 in that order.  Falls through to
// .restore_and_exit.
244	.move_bytes_from_alignment:
245	{ .mib
// Nothing left: go restore ar.lc and return.
246	cmp.eq p_scr, p0 = cnt, r0
247	tbit.nz.unc p_y, p0 = cnt, `2` // should we terminate with a st4 ?
248	(p_scr) br.cond.dpnt.few .restore_and_exit
249	;; }
250	{ .mib
251	(p_y) st4 [ptr1] = r0,`4`
252	tbit.nz.unc p_yy, p0 = cnt, `1` // should we terminate with a st2 ?
253	;; }
254	{ .mib
255	(p_yy) st2 [ptr1] = r0,`2`
256	tbit.nz.unc p_y, p0 = cnt, `0` // should we terminate with a st1 ?
257	;; }
258
259	{ .mib
260	(p_y) st1 [ptr1] = r0
261	;; }
// Common exit for the paths that loaded ar.lc with a loop count: put the
// value saved on entry (save_lc) back into ar.lc and return to the caller.
262	.restore_and_exit:
263	{ .mib
264	nop.m `0`
265	movi0 ar.lc = save_lc
266	br.ret.sptk.many rp
267	;; }
268
// Short path, reached from the entry code when 16 > cnt (and cnt != 0).
// ptr1 = dest; p_y/p_n say whether dest is odd/even (tbit on bit 0 of dest
// in the entry code).  An odd dest first gets a single st1 so that ptr1
// becomes even; after that up to three rounds of two st2 stores (ptr1 and
// ptr2 = ptr1 + 2, both stepping by 4) clear 4 bytes each, and a final
// st2 and/or st1 finish the job.  Returns directly.
269	.move_bytes_unaligned:
270	{ .mmi
271	.pred.rel "mutex",p_y, p_n
272	.pred.rel "mutex",p_yy, p_nn
// p_yy = at least 4 bytes remain once the optional odd byte is accounted
// for (hence the 4 vs. 5 thresholds on the unadjusted cnt).
273	(p_n) cmp.le p_yy, p_nn = `4`, cnt
274	(p_y) cmp.le p_yy, p_nn = `5`, cnt
275	(p_n) add ptr2 = `2`, ptr1
276	} { .mmi
277	(p_y) add ptr2 = `3`, ptr1
278	(p_y) st1 [ptr1] = r0, `1` // fill 1 (odd-aligned) byte
279	(p_y) add cnt = -`1`, cnt // [15, 14 (or less) left]
280	;; }
281	{ .mmi
// p_y is reused from here on: set when a second 4-byte round is needed.
282	(p_yy) cmp.le.unc p_y, p0 = `8`, cnt
// ptr3 = ptr1 + cnt is the end of the area; it is decremented below to
// address the last byte.
283	add ptr3 = ptr1, cnt // prepare last store
// ar.lc is written back from save_lc here because this path returns
// without going through .restore_and_exit.
284	movi0 ar.lc = save_lc
285	} { .mmi
286	(p_yy) st2 [ptr1] = r0, `4` // fill 2 (aligned) bytes
287	(p_yy) st2 [ptr2] = r0, `4` // fill 2 (aligned) bytes
288	(p_yy) add cnt = -`4`, cnt // [11, 10 (or less) left]
289	;; }
290	{ .mmi
// p_yy is reused: set when a third 4-byte round is needed.
291	(p_y) cmp.le.unc p_yy, p0 = `8`, cnt
292	add ptr3 = -`1`, ptr3 // last store
// Bits 1 and 0 of cnt are unaffected by the subtractions of 4, so they
// can be sampled before the rounds are finished.
293	tbit.nz p_scr, p0 = cnt, `1` // will there be a st2 at the end ?
294	} { .mmi
295	(p_y) st2 [ptr1] = r0, `4` // fill 2 (aligned) bytes
296	(p_y) st2 [ptr2] = r0, `4` // fill 2 (aligned) bytes
297	(p_y) add cnt = -`4`, cnt // [7, 6 (or less) left]
298	;; }
299	{ .mmi
300	(p_yy) st2 [ptr1] = r0, `4` // fill 2 (aligned) bytes
301	(p_yy) st2 [ptr2] = r0, `4` // fill 2 (aligned) bytes
302	// [3, 2 (or less) left]
303	tbit.nz p_y, p0 = cnt, `0` // will there be a st1 at the end ?
304	} { .mmi
305	(p_yy) add cnt = -`4`, cnt
306	;; }
307	{ .mmb
308	(p_scr) st2 [ptr1] = r0 // fill 2 (aligned) bytes
309	(p_y) st1 [ptr3] = r0 // fill last byte (using ptr3)
310	br.ret.sptk.many rp
311	;; }
312	END(bzero)
313

source code of glibc/sysdeps/ia64/bzero.S