/* Optimized memset implementation for PowerPC64/POWER8.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
   Returns 's'.  */

#ifndef MEMSET
# define MEMSET memset
#endif
	.machine power8
ENTRY_TOCLESS (MEMSET, 5)
	CALL_MCOUNT 3

L(_memset):
	cmpldi	cr7,r5,31
	neg	r0,r3
	mr	r10,r3

	insrdi	r4,r4,8,48
	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
	ble	cr7,L(write_LT_32)

	andi.	r11,r10,15	/* Check alignment of DST.  */
	insrdi	r4,r4,32,0	/* Replicate word to double word.  */
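	/* r4 now holds the set byte replicated into all eight byte lanes,
	   e.g. c = 0x2a gives r4 = 0x2a2a2a2a2a2a2a2a.  */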

	beq	L(big_aligned)

	mtocrf	0x01,r0
	clrldi	r0,r0,60
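	/* r0 = (-DST) & 0xf, the number of bytes needed to reach a 16-byte
	   boundary; its low bits, copied into cr7 by the mtocrf above,
	   select the 1-, 2-, 4- and 8-byte stores below.  */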

	/* Get DST aligned to 16 bytes.  */
1:	bf	31,2f
	stb	r4,0(r10)
	addi	r10,r10,1

2:	bf	30,4f
	sth	r4,0(r10)
	addi	r10,r10,2

4:	bf	29,8f
	stw	r4,0(r10)
	addi	r10,r10,4

8:	bf	28,16f
	std	r4,0(r10)
	addi	r10,r10,8

16:	subf	r5,r0,r5

	.align	4
L(big_aligned):
	/* For sizes larger than 255 there are two possible paths:
	   - if the constant is '0', zero full cache lines with dcbz
	   - otherwise use vector instructions.  */
	cmpldi	cr5,r5,255
	dcbtst	0,r10
	cmpldi	cr6,r4,0
	crand	27,26,21
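	/* CR bit 27 (cr6.so) := cr6.eq AND cr5.gt, i.e. the constant is 0
	   and the length is greater than 255 bytes.  */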
	bt	27,L(huge_dcbz)
	bge	cr5,L(huge_vector)

	/* Size between 32 and 255 bytes with a constant different from 0:
	   use doubleword stores to achieve best throughput.  */
	srdi	r8,r5,5
	clrldi	r11,r5,59
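	/* r8 = remaining length / 32 (iterations of the 32-byte loop below),
	   r11 = remaining length % 32 (tail bytes).  */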
	cmpldi	cr6,r11,0
	cmpdi	r8,0
	beq	L(tail_bytes)
	mtctr	r8

	/* Main aligned write loop, writes 32 bytes at a time.  */
	.align	4
L(big_loop):
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32
	bdz	L(tail_bytes)

	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32
	bdnz	L(big_loop)

	b	L(tail_bytes)

	/* Write remaining 1~31 bytes.  */
	.align	4
L(tail_bytes):
	beqlr	cr6

	srdi	r7,r11,4
	clrldi	r8,r11,60
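	/* r7 = tail / 16, r8 = tail % 16; their low bits, copied into cr7
	   by the mtocrf instructions below, select the remaining stores.  */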
	mtocrf	0x01,r7

	.align	4
	bf	31,8f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
8:	mtocrf	0x1,r8
	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf	29,2f
	stw	r4,0(r10)
	addi	r10,r10,4

	.align	4
2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Size larger than 255 bytes with a constant different from 0: use
	   vector instructions to achieve best throughput.  */
L(huge_vector):
	/* Replicate the set byte to a quadword in a VMX register.  */
	mtvsrd	v1,r4
	xxpermdi 32,v0,v1,0
	vspltb	v2,v0,15
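	/* r4 (the byte already replicated to a doubleword) is transferred
	   into the vector unit and vspltb splats that byte across all 16
	   bytes of v2.  */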

	/* Main aligned write loop: 128 bytes at a time.  */
	li	r6,16
	li	r7,32
	li	r8,48
	mtocrf	0x02,r5
	srdi	r12,r5,7
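	/* r12 = remaining length / 128, the iteration count for the
	   128-byte loop; the mtocrf instructions here and in L(aligned_tail)
	   copy the low bits of the length into cr6/cr7 for the tail bf
	   tests.  */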
	cmpdi	r12,0
	beq	L(aligned_tail)
	mtctr	r12
	b	L(aligned_128loop)

	.align	4
L(aligned_128loop):
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64
	bdnz	L(aligned_128loop)

	/* Write remaining 1~127 bytes.  */
L(aligned_tail):
	mtocrf	0x01,r5
	bf	25,32f
	stvx	v2,0,r10
	stvx	v2,r10,r6
	stvx	v2,r10,r7
	stvx	v2,r10,r8
	addi	r10,r10,64

32:	bf	26,16f
	stvx	v2,0,r10
	stvx	v2,r10,r6
	addi	r10,r10,32

16:	bf	27,8f
	stvx	v2,0,r10
	addi	r10,r10,16

8:	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	/* Copies 4~7 bytes.  */
4:	bf	29,L(tail2)
	stw	r4,0(r10)
	bf	30,L(tail5)
	sth	r4,4(r10)
	bflr	31
	stb	r4,6(r10)
	/* Return original DST pointer.  */
	blr

	/* Special case when value is 0 and we have a long length to deal
	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
	   Before using dcbz though, we need to get the destination 128-byte
	   aligned.  */
	.align	4
L(huge_dcbz):
	andi.	r11,r10,127
	neg	r0,r10
	beq	L(huge_dcbz_aligned)

	clrldi	r0,r0,57
	subf	r5,r0,r5
	srdi	r0,r0,3
	mtocrf	0x01,r0
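	/* r0 = bytes needed to reach a 128-byte boundary; r0 / 8 is the
	   number of doubleword stores required, and its low bits (via cr7)
	   select the 64/32/16/8-byte steps below.  */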

	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
8:	bf	28,4f

	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	29,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	30,1f
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16

	.align	4
1:	bf	31,L(huge_dcbz_aligned)
	std	r4,0(r10)
	addi	r10,r10,8

L(huge_dcbz_aligned):
	/* Setup dcbz unroll offsets and count numbers.  */
	srdi	r8,r5,9
	clrldi	r11,r5,55
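	/* r8 = remaining length / 512 (iterations of the dcbz loop),
	   r11 = remaining length % 512.  */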
	cmpldi	cr6,r11,0
	li	r9,128
	cmpdi	r8,0
	beq	L(huge_tail)
	li	r7,256
	li	r6,384
	mtctr	r8

	.align	4
L(huge_loop):
	/* Sets 512 bytes to zero in each iteration; the loop unrolling shows
	   a throughput boost for large sizes (2048 bytes or higher).  */
	dcbz	0,r10
	dcbz	r9,r10
	dcbz	r7,r10
	dcbz	r6,r10
	addi	r10,r10,512
	bdnz	L(huge_loop)

	beqlr	cr6

L(huge_tail):
	srdi	r6,r11,8
	srdi	r7,r11,4
	clrldi	r8,r11,4
	cmpldi	cr6,r8,0
	mtocrf	0x01,r6
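	/* r6 = tail / 256 and r7 = tail / 16; cr6 is EQ when no tail bytes
	   remain.  */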

	beq	cr6,L(tail)

	/* We have 1~511 bytes remaining.  */
	.align	4
32:	bf	31,16f
	dcbz	0,r10
	dcbz	r9,r10
	addi	r10,r10,256

	.align	4
16:	mtocrf	0x01,r7
	bf	28,8f
	dcbz	0,r10
	addi	r10,r10,128

	.align	4
8:	bf	29,4f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	std	r4,32(r10)
	std	r4,40(r10)
	std	r4,48(r10)
	std	r4,56(r10)
	addi	r10,r10,64

	.align	4
4:	bf	30,2f
	std	r4,0(r10)
	std	r4,8(r10)
	std	r4,16(r10)
	std	r4,24(r10)
	addi	r10,r10,32

	.align	4
2:	bf	31,L(tail)
	std	r4,0(r10)
	std	r4,8(r10)
	addi	r10,r10,16
	.align	4

	/* Remaining 1~15 bytes.  */
L(tail):
	mtocrf	0x01,r8

	.align	4
8:	bf	28,4f
	std	r4,0(r10)
	addi	r10,r10,8

	.align	4
4:	bf	29,2f
	stw	r4,0(r10)
	addi	r10,r10,4

	.align	4
2:	bf	30,1f
	sth	r4,0(r10)
	addi	r10,r10,2

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
	   by just unrolling all operations.  */
	.align	4
L(write_LT_32):
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
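	/* The 8/4/2/1 bits of the length, copied into cr7, drive the bf
	   tests in the tail code below.  */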
	ble	cr6,L(write_LE_8)

	/* At least 9 bytes to go.  */
	neg	r8,r4
	andi.	r0,r8,3
	cmpldi	cr1,r5,16
	beq	L(write_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,r0
	subf	r5,r0,r5

2:	bf	30,1f
	/* Use stb instead of sth because it doesn't generate
	   alignment interrupts on cache-inhibited storage.  */
	stb	r4,0(r10)
	stb	r4,1(r10)
	addi	r10,r10,2

1:	bf	31,L(end_4bytes_alignment)
	stb	r4,0(r10)
	addi	r10,r10,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(write_LT_32_aligned):
	blt	cr1,8f

	stw	r4,0(r10)
	stw	r4,4(r10)
	stw	r4,8(r10)
	stw	r4,12(r10)
	addi	r10,r10,16

8:	bf	28,L(tail4)
	stw	r4,0(r10)
	stw	r4,4(r10)
	addi	r10,r10,8

	.align	4
	/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	stw	r4,0(r10)
	bf	30,L(tail5)
	sth	r4,4(r10)
	bflr	31
	stb	r4,6(r10)
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	sth	r4,0(r10)
	bflr	31
	stb	r4,2(r10)
	blr

	.align	4
L(tail5):
	bflr	31
	stb	r4,4(r10)
	blr

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(write_LE_8):
	bne	cr6,L(LE7_tail4)
	/* If input is word aligned, use stw, else use stb.  */
	andi.	r0,r10,3
	bne	L(8_unalign)

	stw	r4,0(r10)
	stw	r4,4(r10)
	blr

	/* Unaligned input and size is 8.  */
	.align	4
L(8_unalign):
	andi.	r0,r10,1
	beq	L(8_hwalign)
	stb	r4,0(r10)
	sth	r4,1(r10)
	sth	r4,3(r10)
	sth	r4,5(r10)
	stb	r4,7(r10)
	blr

	/* Halfword aligned input and size is 8.  */
	.align	4
L(8_hwalign):
	sth	r4,0(r10)
	sth	r4,2(r10)
	sth	r4,4(r10)
	sth	r4,6(r10)
	blr

	.align	4
	/* Copies 4~7 bytes.  */
L(LE7_tail4):
	/* Use stb instead of sth because it doesn't generate
	   alignment interrupts on cache-inhibited storage.  */
	bf	29,L(LE7_tail2)
	stb	r4,0(r10)
	stb	r4,1(r10)
	stb	r4,2(r10)
	stb	r4,3(r10)
	bf	30,L(LE7_tail5)
	stb	r4,4(r10)
	stb	r4,5(r10)
	bflr	31
	stb	r4,6(r10)
	blr

	.align	4
	/* Copies 2~3 bytes.  */
L(LE7_tail2):
	bf	30,1f
	stb	r4,0(r10)
	stb	r4,1(r10)
	bflr	31
	stb	r4,2(r10)
	blr

	.align	4
L(LE7_tail5):
	bflr	31
	stb	r4,4(r10)
	blr

	.align	4
1:	bflr	31
	stb	r4,0(r10)
	blr

END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)
