/* Optimized memset for Fujitsu A64FX processor.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sysdeps/aarch64/memset-reg.h>

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */

#define L1_SIZE         (64*1024)               // L1 cache size: 64KB
#define L2_SIZE         (8*1024*1024)           // L2 cache size: 8MB
#define CACHE_LINE_SIZE 256
#define PF_DIST_L1      (CACHE_LINE_SIZE * 16)  // L1 prefetch distance
#define vector_length   x9
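
// A rough sketch of the size dispatch implemented below, where VL is
// the SVE vector length in bytes (read at run time with cntb):
//
//   count <  2*VL      up to 2 predicated stores
//   count <= 4*VL      2 stores from the start, 2 from the end
//   count <= 8*VL      4 stores from the start, 4 from the end
//   count <= L1_SIZE   unrolled loop of 8 stores (L(unroll8))
//   count <  L2_SIZE   the same loop with L1 prefetching (VL == 64 only)
//   count >= L2_SIZE   DC ZVA when the fill value is zero, else L(unroll8)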

#if HAVE_AARCH64_SVE_ASM

        .arch armv8.2-a+sve

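        // Emit st1b stores of z0 (governed by p0) at vector offsets
        // \first .. \last from dst, recursing on \first until it
        // reaches \last.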
        .macro st1b_unroll first=0, last=7
        st1b z0.b, p0, [dst, \first, mul vl]
        .if \last-\first
        st1b_unroll "(\first+1)", \last
        .endif
        .endm


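// A64FX implements ARMv8.2-A, which predates BTI, so define BTI_C as
// empty to stop ENTRY from emitting a BTI landing pad.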
#undef BTI_C
#define BTI_C

ENTRY (__memset_a64fx)
        PTR_ARG (0)
        SIZE_ARG (2)

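        // Broadcast the fill value, then handle count < vector_length * 2
        // inline: p1 predicates the first vector and p0 the second, both
        // capped at count, so two predicated stores cover the buffer.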
        cntb vector_length
        dup z0.b, valw
        whilelo p0.b, vector_length, count
        b.last 1f
        whilelo p1.b, xzr, count
        st1b z0.b, p1, [dstin, 0, mul vl]
        st1b z0.b, p0, [dstin, 1, mul vl]
        ret

        // count >= vector_length * 2
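        // Handle up to 4 vectors: store 2 from dstin and 2 back from
        // dstend; the two pairs may overlap in the middle.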
1:      cmp count, vector_length, lsl 2
        add dstend, dstin, count
        b.hi 1f
        st1b z0.b, p0, [dstin, 0, mul vl]
        st1b z0.b, p0, [dstin, 1, mul vl]
        st1b z0.b, p0, [dstend, -2, mul vl]
        st1b z0.b, p0, [dstend, -1, mul vl]
        ret

        // count > vector_length * 4
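        // Handle up to 8 vectors: store 4 from dstin and 4 back from
        // dstend, again overlapping in the middle; beyond 8 vectors,
        // fall through to the vector-length-agnostic code.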
1:      lsl tmp1, vector_length, 3
        cmp count, tmp1
        b.hi L(vl_agnostic)
        st1b z0.b, p0, [dstin, 0, mul vl]
        st1b z0.b, p0, [dstin, 1, mul vl]
        st1b z0.b, p0, [dstin, 2, mul vl]
        st1b z0.b, p0, [dstin, 3, mul vl]
        st1b z0.b, p0, [dstend, -4, mul vl]
        st1b z0.b, p0, [dstend, -3, mul vl]
        st1b z0.b, p0, [dstend, -2, mul vl]
        st1b z0.b, p0, [dstend, -1, mul vl]
        ret

        .p2align 4
L(vl_agnostic): // VL Agnostic
        mov dst, dstin
        cmp count, L1_SIZE
        b.hi L(L1_prefetch)

        // count >= 8 * vector_length
L(unroll8):
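        // Main loop: 8 vectors (tmp1 = 8 * vector_length bytes) per
        // iteration.  count is biased down by tmp1 first, so the
        // subs/b.hi exit test leaves a tail of up to tmp1 bytes,
        // which L(last) finishes from dstend.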
        sub count, count, tmp1
        .p2align 4
        // The cmp and branch at the top of the following loop are a
        // workaround to avoid a performance drop at the 16KB
        // peak-performance point.  It was found heuristically, and the
        // branch condition, b.ne, is chosen so that the branch is
        // never taken.
1:      cmp xzr, xzr
        b.ne 1b
        st1b_unroll 0, 7
        add dst, dst, tmp1
        subs count, count, tmp1
        b.hi 1b
        add count, count, tmp1

L(last):
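        // Tail: at most 8 vectors remain.  Store 2, 5, or 8 vectors
        // back from dstend depending on count; these stores may
        // overlap bytes that were already written.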
        cmp count, vector_length, lsl 1
        b.ls 2f
        add tmp2, vector_length, vector_length, lsl 2
        cmp count, tmp2
        b.ls 5f
        st1b z0.b, p0, [dstend, -8, mul vl]
        st1b z0.b, p0, [dstend, -7, mul vl]
        st1b z0.b, p0, [dstend, -6, mul vl]
5:      st1b z0.b, p0, [dstend, -5, mul vl]
        st1b z0.b, p0, [dstend, -4, mul vl]
        st1b z0.b, p0, [dstend, -3, mul vl]
2:      st1b z0.b, p0, [dstend, -2, mul vl]
        st1b z0.b, p0, [dstend, -1, mul vl]
        ret

        // count >= L1_SIZE
        .p2align 3
L(L1_prefetch):
        cmp count, L2_SIZE
        b.hs L(L2)
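        // The prefetch loop assumes VL == 64: 8 vectors then cover
        // exactly two 256-byte cache lines per iteration.  For any
        // other vector length, use the plain unrolled loop instead.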
        cmp vector_length, 64
        b.ne L(unroll8)
1:      st1b_unroll 0, 3
        prfm pstl1keep, [dst, PF_DIST_L1]
        st1b_unroll 4, 7
        prfm pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
        add dst, dst, CACHE_LINE_SIZE * 2
        sub count, count, CACHE_LINE_SIZE * 2
        cmp count, PF_DIST_L1
        b.hs 1b
        b L(unroll8)

        // count >= L2_SIZE
        .p2align 3
L(L2):
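        // DC ZVA zeroes a whole cache line, so it is usable only when
        // the fill value is zero; otherwise keep using the store loop.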
        tst valw, 255
        b.ne L(unroll8)
        // align dst to CACHE_LINE_SIZE byte boundary
        and tmp2, dst, CACHE_LINE_SIZE - 1
        st1b z0.b, p0, [dst, 0, mul vl]
        st1b z0.b, p0, [dst, 1, mul vl]
        st1b z0.b, p0, [dst, 2, mul vl]
        st1b z0.b, p0, [dst, 3, mul vl]
        sub dst, dst, tmp2
        add count, count, tmp2

        // clear cachelines using DC ZVA
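        // The stores above cover every byte up to the first zeroed
        // line, and dst is pre-incremented in the loop so the first,
        // only partially owned cache line is never zeroed.  count is
        // biased down by 2 cache lines so the loop exits with up to
        // one line left over for L(last).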
        sub count, count, CACHE_LINE_SIZE * 2
        .p2align 4
1:      add dst, dst, CACHE_LINE_SIZE
        dc zva, dst
        subs count, count, CACHE_LINE_SIZE
        b.hi 1b
        add count, count, CACHE_LINE_SIZE
        b L(last)

END (__memset_a64fx)

#endif /* HAVE_AARCH64_SVE_ASM */