/* Copyright (C) 2018-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "memset-reg.h"

#ifndef MEMSET
# define MEMSET __memset_base64
#endif

/* To disable DC ZVA, set this threshold to 0.  */
#ifndef DC_ZVA_THRESHOLD
# define DC_ZVA_THRESHOLD 512
#endif

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */

ENTRY_ALIGN (MEMSET, 6)

	PTR_ARG (0)
	SIZE_ARG (2)

	bfi	valw, valw, 8, 8
	bfi	valw, valw, 16, 16
	bfi	val, val, 32, 32
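	/* The three bfi ops above broadcast the low byte of valw across
	   all 64 bits of val; e.g. a fill byte of 0x2a yields
	   val == 0x2a2a2a2a2a2a2a2a, i.e. byte * 0x0101010101010101.  */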

	add	dstend, dstin, count

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
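	/* Dispatch: more than 96 bytes goes to L(set_long), 16..96 bytes
	   to L(set_medium), and 0..15 bytes falls through below.  */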

	/* Set 0..15 bytes.  */
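	/* For 8..15 bytes, two potentially overlapping 8-byte stores
	   from both ends cover the range without a per-size branch;
	   e.g. with count = 9 the stores at dstin and dstend - 8
	   overlap by 7 bytes.  */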
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret

	.p2align 3
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	.p2align 3
	/* Set 16..96 bytes.  */
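	/* A 16-byte pair stored from each end covers 16..32 bytes; sizes
	   with bit 5 set (32..63) get a second pair from each end, and
	   the overlapping stores absorb everything in between.  Bit 6 of
	   count routes 64..96 to L(set96).  */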
L(set_medium):
	stp	val, val, [dstin]
	tbnz	count, 6, L(set96)
	stp	val, val, [dstend, -16]
	tbz	count, 5, 1f
	stp	val, val, [dstin, 16]
	stp	val, val, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	stp	val, val, [dstin, 16]
	stp	val, val, [dstin, 32]
	stp	val, val, [dstin, 48]
	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

	.p2align 4
L(set_long):
	stp	val, val, [dstin]
	bic	dst, dstin, 15
#if DC_ZVA_THRESHOLD
	cmp	count, DC_ZVA_THRESHOLD
	ccmp	val, 0, 0, cs
	b.eq	L(zva_64)
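	/* The ccmp sets Z only when count >= DC_ZVA_THRESHOLD (cs from
	   the cmp above) and val is zero, so b.eq takes the DC ZVA path
	   only for large zero memsets.  */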
#endif
	/* Memsets below DC_ZVA_THRESHOLD, or with a non-zero value, do
	   not use DC ZVA.  */
	sub	count, dstend, dst

	/*
	 * Adjust count and bias for the loop.  By subtracting an extra 1
	 * from count, a single tbz instruction can check whether the
	 * remaining tail is less than 33 bytes, so two unnecessary stps
	 * can be bypassed.
	 */
	sub	count, count, 64+16+1
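	/* For example, if dstend - dst is 97, count is now 16: the loop
	   below runs once, storing dst+16..dst+80 and leaving count at
	   -48.  Bit 5 of -48 is clear, so the two extra stps are skipped
	   and the stores at dstend-32 cover the remaining 17 bytes.  */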

#if DC_ZVA_THRESHOLD
	/* Align the loop on a 16-byte boundary, which may be friendlier
	   to the instruction cache.  */
	nop
#endif

1:	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]
	stp	val, val, [dst, 64]!
	subs	count, count, 64
	b.hs	1b

	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes?  */
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
1:	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

#if DC_ZVA_THRESHOLD
	.p2align 3
L(zva_64):
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]
	bic	dst, dst, 63

	/*
	 * The preceding stores may cross a cache-line boundary and leave
	 * that line only partially dirty.  Zeroing such a line with
	 * DC ZVA incurs extra cost, since the untouched part of the line
	 * has to be fetched from memory before it is zeroed.
	 *
	 * So write the first 64-byte aligned block with stp to make the
	 * whole cache line dirty.
	 */
	stp	val, val, [dst, 64]
	stp	val, val, [dst, 80]
	stp	val, val, [dst, 96]
	stp	val, val, [dst, 112]
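	/* The four stps above fill dst+64..dst+128, the first fully
	   aligned 64-byte block, so the DC ZVA loop starts at dst + 128
	   on a clean line boundary.  */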

	sub	count, dstend, dst
	/*
	 * Adjust count and bias for the loop.  As above, subtracting an
	 * extra 1 from count lets a single tbz instruction check whether
	 * the remaining tail is less than 33 bytes, bypassing two
	 * unnecessary stps.
	 */
	sub	count, count, 128+64+64+1
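	/* After the loop below exits, 65..128 bytes remain to be set;
	   the extra 1 again makes bit 5 of count the "more than 96 bytes
	   left" test, so the two extra stps run only when needed.  */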
	add	dst, dst, 128
	nop

	/* DC ZVA zeroes 64 bytes per iteration; this variant assumes a
	   64-byte ZVA block size.  */
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hs	1b

	/*
	 * Write the last 64-byte aligned block with stp to keep that
	 * cache line fully dirty.
	 */
	stp	val, val, [dst, 0]
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]

	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes?  */
	stp	val, val, [dst, 64]
	stp	val, val, [dst, 80]
1:	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret
#endif

END (MEMSET)
libc_hidden_builtin_def (MEMSET)