memset.S source code [glibc/sysdeps/powerpc/powerpc64/power6/memset.S]

/* Optimized 64-bit memset implementation for POWER6.
   Copyright (C) 1997-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's' (r3 is never written by this routine).

   Strategy, as implemented below:
     - n <= 8: individual byte stores (L(small)).
     - Otherwise the fill byte is replicated to a halfword, word and
       finally a doubleword in rCHR, the destination is aligned to a
       doubleword with byte/halfword/word stores, and:
	 - n <= 31 remaining: finished by L(medium), which stores
	   backwards from the end of the region.
	 - larger: aligned to a 32-byte sector, then filled in 32-byte
	   and 128-byte (cache line) steps with doubleword stores.
     - When the fill value is zero, whole 128-byte cache lines are
       cleared with the dcbz instruction instead of stores.

   Condition register usage worth knowing when reading the code:
   "mtcrf 0x01, reg" copies the low four bits of reg into CR field 7,
   so CR bits 28, 29, 30 and 31 reflect the 8, 4, 2 and 1 bits of reg.  */

#ifndef MEMSET
# define MEMSET memset
#endif
	.machine power6
ENTRY_TOCLESS (MEMSET, 7)
	CALL_MCOUNT 3

#define rTMP	r0
#define rRTN	r3	/* Initial value of 1st argument.  */
#define rMEMP0	r3	/* Original value of 1st arg.  */
#define rCHR	r4	/* Char to set in each byte.  */
#define rLEN	r5	/* Length of region to set.  */
#define rMEMP	r6	/* Address at which we are storing.  */
#define rALIGN	r7	/* Number of bytes we are setting now (when aligning).  */
#define rMEMP2	r8
#define rMEMP3	r9	/* Alt mem pointer.  */
L(_memset):
/* Take care of case for size <= 8 (handled byte-wise in L(small)).
   cr0 records whether the start address is already doubleword
   aligned; it is consumed by the "beq+" below.  */
	cmpldi	cr1, rLEN, 8
	andi.	rALIGN, rMEMP0, 7
	mr	rMEMP, rMEMP0
	ble	cr1, L(small)

/* Align to doubleword boundary.  cr5 remembers whether the original
   length was <= 31; it is tested at L(aligned).  */
	cmpldi	cr5, rLEN, 31
	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
	beq+	L(aligned2)
	mtcrf	0x01, rMEMP0		/* CR7 = low 4 bits of the address.  */
	subfic	rALIGN, rALIGN, 8	/* Bytes needed to reach alignment.  */
	cror	28,30,31		/* Detect odd word aligned.  */
	add	rMEMP, rMEMP, rALIGN	/* rMEMP = aligned address.  */
	sub	rLEN, rLEN, rALIGN
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
	bt	29, L(g4)
/* Process the even word of doubleword.  */
	bf+	31, L(g2)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(g4x)
L(g2):
	sth	rCHR, -6(rMEMP)
L(g4x):
	stw	rCHR, -4(rMEMP)
	b	L(aligned)
/* Process the odd word of doubleword.  */
L(g4):
	bf	28, L(g4x)	/* If false, word aligned on odd word.  */
	bf+	31, L(g0)
	stb	rCHR, 0(rMEMP0)
	bt	30, L(aligned)
L(g0):
	sth	rCHR, -2(rMEMP)

/* Handle the case of size <= 31.  */
L(aligned2):
	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
L(aligned):
	mtcrf	0x01, rLEN		/* CR7 = low 4 bits of remaining length.  */
	ble	cr5, L(medium)
/* Align to 32-byte boundary.  rMEMP is doubleword aligned here, so only
   address bits 0x18 can be non-zero below the sector size.  The "beq"
   consumes cr0 from the andi. (neither subfic nor insrdi changes it).  */
	andi.	rALIGN, rMEMP, 0x18
	subfic	rALIGN, rALIGN, 0x20
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
	beq	L(caligned)
	mtcrf	0x01, rALIGN		/* CR bit 28 = the "8" bit of rALIGN.  */
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP
	bf	28, L(a1)
	stdu	rCHR, -8(rMEMP2)
L(a1):	blt	cr1, L(a2)
	std	rCHR, -8(rMEMP2)
	stdu	rCHR, -16(rMEMP2)
L(a2):

/* Now aligned to a 32 byte boundary.
   cr1 = (fill doubleword == 0), cr0 = (rLEN has no full 32-byte chunk).
   CR7 is reloaded from rLEN here; every later adjustment of rLEN before
   L(medium) is a multiple of 32, so CR7 stays valid for the tail.  */
	.align	4
L(caligned):
	cmpldi	cr1, rCHR, 0
	clrrdi.	rALIGN, rLEN, 5
	mtcrf	0x01, rLEN
	beq	cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
	beq	L(medium)	/* We may not actually get to do a full line.  */
	.align	4
/* Storing a non-zero "c" value.  We are aligned at a sector (32-byte)
   boundary but may not be at cache line (128-byte) boundary.  */
L(nzloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If fewer than 128 bytes remain, use the cacheAligned1 code to
   finish the tail instead.  */
	cmpldi	cr1,rLEN,128

	andi.	rTMP,rMEMP,127
	blt	cr1,L(cacheAligned1)
	addi	rMEMP3,rMEMP,32
	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	addi	rMEMP,rMEMP,32
	andi.	rTMP,rMEMP3,127
	std	rCHR,-8(rMEMP3)

	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP3)
	addi	rMEMP,rMEMP,32
	std	rCHR,8(rMEMP3)
	andi.	rTMP,rMEMP,127
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)

	beq	L(nzCacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,32(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,128
	std	rCHR,40(rMEMP3)
/* NOTE(review): cr6 and rMEMP2 set by the next two instructions do not
   appear to be read anywhere on the non-zero path; presumably kept to
   mirror L(getCacheAligned3) -- confirm before removing.  */
	cmpldi	cr6,rLEN,256
	li	rMEMP2,128
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)
	blt	cr1,L(cacheAligned1)
	b	L(nzCacheAligned128)

/* Now we are aligned to the cache line and can use dcbtst.  */
	.align	4
L(nzCacheAligned):
	cmpldi	cr1,rLEN,128
	blt	cr1,L(cacheAligned1)
	b	L(nzCacheAligned128)
	.align	5
/* Store one full 128-byte line per iteration.  cr1 compares the length
   *before* the decrement against 256, so the loop repeats while at
   least 128 bytes remain afterwards.  */
L(nzCacheAligned128):
	cmpldi	cr1,rLEN,256
	addi	rMEMP3,rMEMP,64
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	std	rCHR,24(rMEMP)
	std	rCHR,32(rMEMP)
	std	rCHR,40(rMEMP)
	std	rCHR,48(rMEMP)
	std	rCHR,56(rMEMP)
	addi	rMEMP,rMEMP3,64
	addi	rLEN,rLEN,-128
	std	rCHR,0(rMEMP3)
	std	rCHR,8(rMEMP3)
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)
	std	rCHR,32(rMEMP3)
	std	rCHR,40(rMEMP3)
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)
	bge	cr1,L(nzCacheAligned128)
	dcbtst	0,rMEMP
	b	L(cacheAligned1)
	.align	5
/* Storing a zero "c" value.  We are aligned at a sector (32-byte)
   boundary but may not be at cache line (128-byte) boundary.  If the
   remaining length spans a full cache line we can use the Data cache
   block zero instruction.  */
L(zloopstart):
/* memset in 32-byte chunks until we get to a cache line boundary.
   If fewer than 128 bytes remain, use the cacheAligned1 code to
   finish the tail instead.  The "beq" still tests cr0 from the
   clrrdi. at L(caligned): no full 32-byte chunk remains.  */
	cmpldi	cr1,rLEN,128
	beq	L(medium)
L(getCacheAligned):
	andi.	rTMP,rMEMP,127
	nop
	blt	cr1,L(cacheAligned1)
	addi	rMEMP3,rMEMP,32
	beq	L(cacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	addi	rMEMP,rMEMP,32
	andi.	rTMP,rMEMP3,127
	std	rCHR,-8(rMEMP3)
L(getCacheAligned2):
	beq	L(cacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP3)
	std	rCHR,8(rMEMP3)
	addi	rMEMP,rMEMP,32
	andi.	rTMP,rMEMP,127
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)
L(getCacheAligned3):
	beq	L(cacheAligned)
	addi	rLEN,rLEN,-32
	std	rCHR,32(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,128
	std	rCHR,40(rMEMP3)
	cmpldi	cr6,rLEN,256
	li	rMEMP2,128		/* Offset of the 2nd line for paired dcbz.  */
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)
	blt	cr1,L(cacheAligned1)
	blt	cr6,L(cacheAligned128)
	b	L(cacheAlignedx)

/* Now we are aligned to the cache line and can use dcbz.  */
	.align	5
L(cacheAligned):
	cmpldi	cr1,rLEN,128
	cmpldi	cr6,rLEN,256
	blt	cr1,L(cacheAligned1)
	li	rMEMP2,128
/* Dispatch on length: < 256 clears one line, > 640 uses the simple
   one-line loop, anything in between clears two lines at a time.  The
   cr1/cr6 compares against 384/512 are taken before rLEN is reduced by
   256, i.e. they test for 128 and 256 bytes remaining afterwards.  */
L(cacheAlignedx):
	cmpldi	cr5,rLEN,640
	blt	cr6,L(cacheAligned128)
	bgt	cr5,L(cacheAligned512)
	cmpldi	cr6,rLEN,512
	dcbz	0,rMEMP
	cmpldi	cr1,rLEN,384
	dcbz	rMEMP2,rMEMP
	addi	rMEMP,rMEMP,256
	addi	rLEN,rLEN,-256
	blt	cr1,L(cacheAligned1)
	blt	cr6,L(cacheAligned128)
	b	L(cacheAligned256)
	.align	5
/* A simple loop for the longer (>640 bytes) lengths.  This form limits
   the branch mispredictions to exactly 1 at loop exit.  */
L(cacheAligned512):
	cmpldi	cr1,rLEN,128
	blt	cr1,L(cacheAligned1)
	dcbz	0,rMEMP
	addi	rLEN,rLEN,-128
	addi	rMEMP,rMEMP,128
	b	L(cacheAligned512)
	.align	5
/* Clear two cache lines per iteration while at least 256 bytes remain
   after the current pair.  */
L(cacheAligned256):

	cmpldi	cr6,rLEN,512

	dcbz	0,rMEMP
	cmpldi	cr1,rLEN,384
	dcbz	rMEMP2,rMEMP
	addi	rMEMP,rMEMP,256
	addi	rLEN,rLEN,-256

	bge	cr6,L(cacheAligned256)

	blt	cr1,L(cacheAligned1)
	.align	4
/* Clear exactly one more cache line, then fall into the sector tail.  */
L(cacheAligned128):
	dcbz	0,rMEMP
	addi	rMEMP,rMEMP,128
	addi	rLEN,rLEN,-128
	nop
/* Fewer than 128 bytes remain: store up to three 32-byte sectors.  */
L(cacheAligned1):
	cmpldi	cr1,rLEN,32
	blt	cr1,L(handletail32)
	addi	rMEMP3,rMEMP,32
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP)
	std	rCHR,8(rMEMP)
	std	rCHR,16(rMEMP)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,32
	std	rCHR,-8(rMEMP3)
L(cacheAligned2):
	blt	cr1,L(handletail32)
	addi	rLEN,rLEN,-32
	std	rCHR,0(rMEMP3)
	std	rCHR,8(rMEMP3)
	addi	rMEMP,rMEMP,32
	cmpldi	cr1,rLEN,32
	std	rCHR,16(rMEMP3)
	std	rCHR,24(rMEMP3)
	nop
L(cacheAligned3):
	blt	cr1,L(handletail32)
	addi	rMEMP,rMEMP,32
	addi	rLEN,rLEN,-32
	std	rCHR,32(rMEMP3)
	std	rCHR,40(rMEMP3)
	std	rCHR,48(rMEMP3)
	std	rCHR,56(rMEMP3)

/* We are here because the length or remainder (rLEN) is less than the
   cache line/sector size and does not justify aggressive loop unrolling.
   So set up the preconditions for L(medium) and go there.  Returns
   immediately when nothing is left.  */
	.align	3
L(handletail32):
	cmpldi	cr1,rLEN,0
	beqlr	cr1
	b	L(medium)

	.align	5
L(small):
/* Memset of 8 bytes or less.  For 5..8 bytes the first four are stored
   unconditionally and the remaining 1..4 are handled by L(le4).  */
	cmpldi	cr6, rLEN, 4
	cmpldi	cr5, rLEN, 1
	ble	cr6,L(le4)
	subi	rLEN, rLEN, 4
	stb	rCHR,0(rMEMP)
	stb	rCHR,1(rMEMP)
	stb	rCHR,2(rMEMP)
	stb	rCHR,3(rMEMP)
	addi	rMEMP,rMEMP, 4
	cmpldi	cr5, rLEN, 1
/* 0..4 bytes: cr5 compares rLEN with 1, cr1 compares rLEN with 3;
   return as soon as the stored count matches rLEN.  */
L(le4):
	cmpldi	cr1, rLEN, 3
	bltlr	cr5
	stb	rCHR, 0(rMEMP)
	beqlr	cr5
	stb	rCHR, 1(rMEMP)
	bltlr	cr1
	stb	rCHR, 2(rMEMP)
	beqlr	cr1
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes.  rMEMP is advanced to the end of the region and
   the bytes are stored backwards: CR bits 31/30/29/28 (the 1/2/4/8 bits
   of rLEN, loaded earlier with mtcrf) select a byte, halfword, word and
   doubleword store, and cr1 (rLEN >= 16) selects a 16-byte store.  */
	.align	5
L(medium):
	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
	cmpldi	cr1, rLEN, 16
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt	29, L(medium_29t)
L(medium_29f):
	bge	cr1, L(medium_27t)
	bflr	28
	std	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt	cr1, L(medium_27f)
L(medium_27t):
	std	rCHR, -8(rMEMP)
	stdu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr	28
L(medium_28t):
	std	rCHR, -8(rMEMP)
	blr
END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)
384

source code of glibc/sysdeps/powerpc/powerpc64/power6/memset.S