/* Optimized memset for Fujitsu A64FX processor.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sysdeps/aarch64/memset-reg.h>

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */

#define L1_SIZE         (64*1024)       // L1 64KB
#define L2_SIZE         (8*1024*1024)   // L2 8MB
#define CACHE_LINE_SIZE 256             // A64FX cache line is 256B
#define PF_DIST_L1      (CACHE_LINE_SIZE * 16)  // Prefetch distance L1
#define vector_length   x9

#if HAVE_AARCH64_SVE_ASM

        .arch armv8.2-a+sve

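        // st1b_unroll expands recursively into last - first + 1
        // consecutive st1b stores, each targeting the next vector
        // (VL bytes) of the destination.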
        .macro st1b_unroll first=0, last=7
        st1b    z0.b, p0, [dst, \first, mul vl]
        .if \last-\first
        st1b_unroll "(\first+1)", \last
        .endif
        .endm

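        // BTI_C is made empty so that ENTRY emits no BTI landing pad:
        // this variant is only selected on A64FX (ARMv8.2-A), which
        // does not implement BTI.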
#undef BTI_C
#define BTI_C

ENTRY (__memset_a64fx)
        PTR_ARG (0)
        SIZE_ARG (2)

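        // Broadcast the fill byte to z0 and handle small sizes inline.
        // whilelo sets p0 for byte indices [VL, count), so b.last is
        // taken iff count >= 2 * VL; otherwise two predicated stores
        // cover [0, VL) and [VL, count).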
        cntb    vector_length
        dup     z0.b, valw
        whilelo p0.b, vector_length, count
        b.last  1f
        whilelo p1.b, xzr, count
        st1b    z0.b, p1, [dstin, 0, mul vl]
        st1b    z0.b, p0, [dstin, 1, mul vl]
        ret

        // count >= vector_length * 2
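        // If count <= vector_length * 4, write two vectors from each
        // end (the stores may overlap in the middle) and return.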
1:      cmp     count, vector_length, lsl 2
        add     dstend, dstin, count
        b.hi    1f
        st1b    z0.b, p0, [dstin, 0, mul vl]
        st1b    z0.b, p0, [dstin, 1, mul vl]
        st1b    z0.b, p0, [dstend, -2, mul vl]
        st1b    z0.b, p0, [dstend, -1, mul vl]
        ret

        // count > vector_length * 4
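        // If count <= vector_length * 8, write four vectors from each
        // end (again possibly overlapping) and return.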
1:      lsl     tmp1, vector_length, 3
        cmp     count, tmp1
        b.hi    L(vl_agnostic)
        st1b    z0.b, p0, [dstin, 0, mul vl]
        st1b    z0.b, p0, [dstin, 1, mul vl]
        st1b    z0.b, p0, [dstin, 2, mul vl]
        st1b    z0.b, p0, [dstin, 3, mul vl]
        st1b    z0.b, p0, [dstend, -4, mul vl]
        st1b    z0.b, p0, [dstend, -3, mul vl]
        st1b    z0.b, p0, [dstend, -2, mul vl]
        st1b    z0.b, p0, [dstend, -1, mul vl]
        ret

        .p2align 4
L(vl_agnostic): // VL Agnostic
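        // From here on the code does not depend on a specific vector
        // length: the main loop stores 8 vectors (tmp1 = 8 * VL bytes)
        // per iteration.  Counts above L1_SIZE get software prefetch,
        // and zero fills above L2_SIZE use DC ZVA.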
        mov     dst, dstin
        cmp     count, L1_SIZE
        b.hi    L(L1_prefetch)

        // count >= 8 * vector_length
L(unroll8):
        sub     count, count, tmp1
        .p2align 4
        // The cmp and branch at the top of the following loop are a
        // workaround to avoid a performance drop at the 16KB
        // peak-performance point.  It was found heuristically, and the
        // branch condition, b.ne, is chosen so that it never jumps
        // (cmp xzr, xzr always sets the Z flag).
1:      cmp     xzr, xzr
        b.ne    1b
        st1b_unroll 0, 7
        add     dst, dst, tmp1
        subs    count, count, tmp1
        b.hi    1b
        add     count, count, tmp1

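        // Write the remaining 1 to 8 vectors relative to dstend so the
        // fill ends exactly at the buffer end; these stores may
        // overlap bytes already written by the loop above.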
L(last):
        cmp     count, vector_length, lsl 1
        b.ls    2f
        add     tmp2, vector_length, vector_length, lsl 2
        cmp     count, tmp2
        b.ls    5f
        st1b    z0.b, p0, [dstend, -8, mul vl]
        st1b    z0.b, p0, [dstend, -7, mul vl]
        st1b    z0.b, p0, [dstend, -6, mul vl]
5:      st1b    z0.b, p0, [dstend, -5, mul vl]
        st1b    z0.b, p0, [dstend, -4, mul vl]
        st1b    z0.b, p0, [dstend, -3, mul vl]
2:      st1b    z0.b, p0, [dstend, -2, mul vl]
        st1b    z0.b, p0, [dstend, -1, mul vl]
        ret

        // count > L1_SIZE
        .p2align 3
L(L1_prefetch):
        cmp     count, L2_SIZE
        b.hs    L(L2)
        cmp     vector_length, 64
        b.ne    L(unroll8)
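        // The loop below assumes VL = 64 (512-bit SVE, as on A64FX),
        // checked just above: each iteration stores exactly two
        // 256-byte cache lines and prefetches PF_DIST_L1 bytes ahead.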
1:      st1b_unroll 0, 3
        prfm    pstl1keep, [dst, PF_DIST_L1]
        st1b_unroll 4, 7
        prfm    pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
        add     dst, dst, CACHE_LINE_SIZE * 2
        sub     count, count, CACHE_LINE_SIZE * 2
        cmp     count, PF_DIST_L1
        b.hs    1b
        b       L(unroll8)

        // count >= L2_SIZE
        .p2align 3
L(L2):
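        // DC ZVA can only store zeros, so this path is taken only
        // when the fill value is zero.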
        tst     valw, 255
        b.ne    L(unroll8)
        // align dst to CACHE_LINE_SIZE byte boundary
        and     tmp2, dst, CACHE_LINE_SIZE - 1
        st1b    z0.b, p0, [dst, 0, mul vl]
        st1b    z0.b, p0, [dst, 1, mul vl]
        st1b    z0.b, p0, [dst, 2, mul vl]
        st1b    z0.b, p0, [dst, 3, mul vl]
        sub     dst, dst, tmp2
        add     count, count, tmp2
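        // dst is now rounded down to a cache-line boundary and count
        // increased to match.  The four stores above (4 * VL = 256
        // bytes with VL = 64) cover dstin up to and past the first
        // full line, so DC ZVA can start at the following line.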

        // clear cachelines using DC ZVA
        sub     count, count, CACHE_LINE_SIZE * 2
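        // Hold back two cache lines: the loop increments dst before
        // each DC ZVA (the head line is already written above), and
        // the last, possibly partial, line is finished by L(last).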
        .p2align 4
1:      add     dst, dst, CACHE_LINE_SIZE
        dc      zva, dst
        subs    count, count, CACHE_LINE_SIZE
        b.hi    1b
        add     count, count, CACHE_LINE_SIZE
        b       L(last)

END (__memset_a64fx)

#endif /* HAVE_AARCH64_SVE_ASM */
