/* Optimized memset for Fujitsu A64FX processor.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <sysdeps/aarch64/memset-reg.h>

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */

#define L1_SIZE         (64*1024)               // L1 cache size: 64KB
#define L2_SIZE         (8*1024*1024)           // L2 cache size: 8MB
#define CACHE_LINE_SIZE 256
#define PF_DIST_L1      (CACHE_LINE_SIZE * 16)  // L1 prefetch distance
#define vector_length   x9
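
// A rough sketch of the size dispatch implemented below, where VL is
// the SVE vector length in bytes (read at run time with cntb):
//
//   count <  2*VL      up to 2 predicated stores
//   count <= 4*VL      2 stores from the start, 2 from the end
//   count <= 8*VL      4 stores from the start, 4 from the end
//   count <= L1_SIZE   unrolled loop of 8 stores (L(unroll8))
//   count <  L2_SIZE   the same loop with L1 prefetching (VL == 64 only)
//   count >= L2_SIZE   DC ZVA when the fill value is zero, else L(unroll8)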

#if HAVE_AARCH64_SVE_ASM

        .arch armv8.2-a+sve

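        // Emit st1b stores of z0 (governed by p0) at vector offsets
        // \first .. \last from dst, recursing on \first until it
        // reaches \last.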
        .macro st1b_unroll first=0, last=7
        st1b z0.b, p0, [dst, \first, mul vl]
        .if \last-\first
        st1b_unroll "(\first+1)", \last
        .endif
        .endm


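// A64FX implements ARMv8.2-A, which predates BTI, so define BTI_C as
// empty to stop ENTRY from emitting a BTI landing pad.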
#undef BTI_C
#define BTI_C

ENTRY (__memset_a64fx)
        PTR_ARG (0)
        SIZE_ARG (2)

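        // Broadcast the fill value, then handle count < vector_length * 2
        // inline: p1 predicates the first vector and p0 the second, both
        // capped at count, so two predicated stores cover the buffer.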
        cntb vector_length
        dup z0.b, valw
        whilelo p0.b, vector_length, count
        b.last 1f
        whilelo p1.b, xzr, count
        st1b z0.b, p1, [dstin, 0, mul vl]
        st1b z0.b, p0, [dstin, 1, mul vl]
        ret

        // count >= vector_length * 2
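        // Handle up to 4 vectors: store 2 from dstin and 2 back from
        // dstend; the two pairs may overlap in the middle.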
1:      cmp count, vector_length, lsl 2
        add dstend, dstin, count
        b.hi 1f
        st1b z0.b, p0, [dstin, 0, mul vl]
        st1b z0.b, p0, [dstin, 1, mul vl]
        st1b z0.b, p0, [dstend, -2, mul vl]
        st1b z0.b, p0, [dstend, -1, mul vl]
        ret

        // count > vector_length * 4
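        // Handle up to 8 vectors: store 4 from dstin and 4 back from
        // dstend, again overlapping in the middle; beyond 8 vectors,
        // fall through to the vector-length-agnostic code.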
1:      lsl tmp1, vector_length, 3
        cmp count, tmp1
        b.hi L(vl_agnostic)
        st1b z0.b, p0, [dstin, 0, mul vl]
        st1b z0.b, p0, [dstin, 1, mul vl]
        st1b z0.b, p0, [dstin, 2, mul vl]
        st1b z0.b, p0, [dstin, 3, mul vl]
        st1b z0.b, p0, [dstend, -4, mul vl]
        st1b z0.b, p0, [dstend, -3, mul vl]
        st1b z0.b, p0, [dstend, -2, mul vl]
        st1b z0.b, p0, [dstend, -1, mul vl]
        ret

        .p2align 4
L(vl_agnostic): // VL Agnostic
        mov dst, dstin
        cmp count, L1_SIZE
        b.hi L(L1_prefetch)

        // count >= 8 * vector_length
L(unroll8):
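        // Main loop: 8 vectors (tmp1 = 8 * vector_length bytes) per
        // iteration.  count is biased down by tmp1 first, so the
        // subs/b.hi exit test leaves a tail of up to tmp1 bytes,
        // which L(last) finishes from dstend.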
        sub count, count, tmp1
        .p2align 4
        // The cmp and branch at the top of the following loop are a
        // workaround to avoid a performance drop at the 16KB
        // peak-performance point.  It was found heuristically, and the
        // branch condition, b.ne, is chosen so that the branch is
        // never taken.
1:      cmp xzr, xzr
        b.ne 1b
        st1b_unroll 0, 7
        add dst, dst, tmp1
        subs count, count, tmp1
        b.hi 1b
        add count, count, tmp1

L(last):
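        // Tail: at most 8 vectors remain.  Store 2, 5, or 8 vectors
        // back from dstend depending on count; these stores may
        // overlap bytes that were already written.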
        cmp count, vector_length, lsl 1
        b.ls 2f
        add tmp2, vector_length, vector_length, lsl 2
        cmp count, tmp2
        b.ls 5f
        st1b z0.b, p0, [dstend, -8, mul vl]
        st1b z0.b, p0, [dstend, -7, mul vl]
        st1b z0.b, p0, [dstend, -6, mul vl]
5:      st1b z0.b, p0, [dstend, -5, mul vl]
        st1b z0.b, p0, [dstend, -4, mul vl]
        st1b z0.b, p0, [dstend, -3, mul vl]
2:      st1b z0.b, p0, [dstend, -2, mul vl]
        st1b z0.b, p0, [dstend, -1, mul vl]
        ret

        // count >= L1_SIZE
        .p2align 3
L(L1_prefetch):
        cmp count, L2_SIZE
        b.hs L(L2)
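        // The prefetch loop assumes VL == 64: 8 vectors then cover
        // exactly two 256-byte cache lines per iteration.  For any
        // other vector length, use the plain unrolled loop instead.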
        cmp vector_length, 64
        b.ne L(unroll8)
1:      st1b_unroll 0, 3
        prfm pstl1keep, [dst, PF_DIST_L1]
        st1b_unroll 4, 7
        prfm pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
        add dst, dst, CACHE_LINE_SIZE * 2
        sub count, count, CACHE_LINE_SIZE * 2
        cmp count, PF_DIST_L1
        b.hs 1b
        b L(unroll8)

        // count >= L2_SIZE
        .p2align 3
L(L2):
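        // DC ZVA zeroes a whole cache line, so it is usable only when
        // the fill value is zero; otherwise keep using the store loop.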
        tst valw, 255
        b.ne L(unroll8)
        // align dst to CACHE_LINE_SIZE byte boundary
        and tmp2, dst, CACHE_LINE_SIZE - 1
        st1b z0.b, p0, [dst, 0, mul vl]
        st1b z0.b, p0, [dst, 1, mul vl]
        st1b z0.b, p0, [dst, 2, mul vl]
        st1b z0.b, p0, [dst, 3, mul vl]
        sub dst, dst, tmp2
        add count, count, tmp2

        // clear cachelines using DC ZVA
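        // The stores above cover every byte up to the first zeroed
        // line, and dst is pre-incremented in the loop so the first,
        // only partially owned cache line is never zeroed.  count is
        // biased down by 2 cache lines so the loop exits with up to
        // one line left over for L(last).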
        sub count, count, CACHE_LINE_SIZE * 2
        .p2align 4
1:      add dst, dst, CACHE_LINE_SIZE
        dc zva, dst
        subs count, count, CACHE_LINE_SIZE
        b.hi 1b
        add count, count, CACHE_LINE_SIZE
        b L(last)

END (__memset_a64fx)

#endif /* HAVE_AARCH64_SVE_ASM */