/* Placeholder function, not used by any processor at the moment.
Copyright (C) 2022-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

#ifndef STRNLEN
#define STRNLEN __strnlen_evex512
#endif

#include "x86-evex512-vecs.h"
#include "reg-macros.h"

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifdef USE_AS_WCSLEN
# define VPCMPEQ vpcmpeqd
# define VPTESTN vptestnmd
# define VPMINU vpminud
# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
# define VPTESTN vptestnmb
# define VPMINU vpminub
# define CHAR_SIZE 1
# endif

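/* With the dword variants above each mask bit corresponds to one
4-byte wchar_t, while the byte variants yield one mask bit per
char. */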
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

.section SECTION(.text),"ax",@progbits
/* Aligning the entry point to 64 bytes provides better performance
for strings of up to one vector length. */
ENTRY_P2ALIGN (STRNLEN, 6)
/* Check zero length. */
test %RSI_LP, %RSI_LP
jz L(ret_max)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %esi, %esi
# endif

movl %edi, %eax
vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
sall $20, %eax
cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
ja L(page_cross)

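/* Illustrative C sketch of the check above: after the shift by 20
only the in-page offset bits (PAGE_SIZE is 4096, so bits 0-11 of
the address) remain in the comparable range, so the branch is taken
iff a full VEC_SIZE load from s would cross a page boundary:

    if (((uintptr_t) s % PAGE_SIZE) > PAGE_SIZE - VEC_SIZE)
      goto page_cross;
*/
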
/* Compare each [w]char against null; the corresponding mask bit is
set for every match. */
VPCMPEQ (%rdi), %VMM(0), %k0
KMOV %k0, %VRCX
/* Store max length in rax. */
mov %rsi, %rax
/* If rcx is 0, rax keeps the max length: bsf leaves its destination
unchanged for a zero source on current x86 implementations. We
cannot use VRCX and VRAX here for evex256 because the upper 32 bits
of ecx and eax may be undefined. */
bsfq %rcx, %rax
cmp $CHAR_PER_VEC, %rax
ja L(align_more)
cmpq %rax, %rsi
cmovb %esi, %eax
ret

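/* Illustrative C sketch of the fast path above (load_vec and
cmpeq_mask are illustrative names, not glibc interfaces):

    mask = cmpeq_mask (load_vec (s), zero);
    len = mask ? tzcnt (mask) : maxlen;
    return len < maxlen ? len : maxlen;
*/
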
/* At this point the max length is exhausted without finding a null
char; return it. */
.p2align 4,,3
L(ret_max):
movq %rsi, %rax
ret

L(align_more):
mov %rdi, %rax
/* Align rax to VEC_SIZE. */
andq $-VEC_SIZE, %rax
movq %rdi, %rdx
subq %rax, %rdx
# ifdef USE_AS_WCSLEN
shr $2, %VRDX
# endif
/* At this point rdx holds the number of [w]chars already compared
that the aligned compares below will scan again. */
leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
/* At this point rdx holds the number of [w]chars left to check.
From now on rdx is decremented after each vector compare. */

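/* Sketch of the bookkeeping above, with aligned = s & -VEC_SIZE and
illustrative names: the aligned compares resume at
aligned + VEC_SIZE, so

    skipped = (s - aligned) / CHAR_SIZE; // [w]chars scanned twice
    remaining = maxlen + skipped - CHAR_PER_VEC;
*/
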
/* Four single-vector compares, unrolled, before entering the
4 x VEC_SIZE loop. */
VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
subq $-VEC_SIZE, %rax
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x1)

subq $CHAR_PER_VEC, %rdx
jbe L(ret_max)

VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x2)

subq $CHAR_PER_VEC, %rdx
jbe L(ret_max)

VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x3)

subq $CHAR_PER_VEC, %rdx
jbe L(ret_max)

VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x4)

subq $CHAR_PER_VEC, %rdx
jbe L(ret_max)
/* Save pointer before 4 x VEC_SIZE alignment. */
movq %rax, %rcx

/* Align address to VEC_SIZE * 4 for loop. */
andq $-(VEC_SIZE * 4), %rax

subq %rax, %rcx
# ifdef USE_AS_WCSLEN
shr $2, %VRCX
# endif
/* rcx holds the number of [w]chars that will be re-compared due to
the alignment fix-up; rdx must be incremented by rcx to offset that
adjustment. */
addq %rcx, %rdx
/* The first iteration falls straight into the loop: rdx was already
adjusted above, so no add/subtract is needed on entry to the
4 x VEC_SIZE aligned loop. */

.p2align 4,,11
L(loop):
/* The VPMINU and VPTESTN combination performs better than the
alternative instruction combinations. */
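/* Illustrative sketch of the loop test: the unsigned min of two
vectors has a zero [w]char iff either input does, so testing the two
VPMINU results covers all four vectors, and KORTEST folds both masks
into a single branch (has_zero and the v* names are illustrative):

    m01 = min (v0, v1);
    m23 = min (v2, v3);
    if (has_zero (m01) || has_zero (m23))
      goto loopend;
*/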
VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)

VPTESTN %VMM(2), %VMM(2), %k0
VPTESTN %VMM(4), %VMM(4), %k1

subq $-(VEC_SIZE * 4), %rax
KORTEST %k0, %k1

jnz L(loopend)
subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop)
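/* Scan budget exhausted without finding a null char; return the max
length. */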
mov %rsi, %rax
ret

L(loopend):

VPTESTN %VMM(1), %VMM(1), %k2
KMOV %k2, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x1)

KMOV %k0, %VRCX
/* At this point, if k0 is non-zero, the null char must be in the
second vector. */
test %VRCX, %VRCX
jnz L(ret_vec_x2)

VPTESTN %VMM(3), %VMM(3), %k3
KMOV %k3, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x3)
/* At this point the null [w]char must be in the fourth vector, so
there is no need to test it. */
KMOV %k1, %VRCX

/* The fourth-, third- and second-vector terminations are almost
identical; they are laid out this way to avoid branching and to
reuse code from the pre-loop exit conditions. */
L(ret_vec_x4):
bsf %VRCX, %VRCX
subq %rdi, %rax
# ifdef USE_AS_WCSLEN
subq $-(VEC_SIZE * 3), %rax
shrq $2, %rax
addq %rcx, %rax
# else
leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
# endif

cmpq %rsi, %rax
cmovnb %rsi, %rax
ret

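/* Illustrative sketch of the return paths above and below: rax
points at the first vector of the current 4-vector block, N is the
vector within the block (1 to 4) that holds the null, and mask is
its compare mask, so

    len = (rax - s) / CHAR_SIZE + (N - 1) * CHAR_PER_VEC
          + tzcnt (mask);
    return len < maxlen ? len : maxlen;

For wcsnlen the byte offset is divided by CHAR_SIZE before the
in-vector index is added, since the mask holds one bit per wchar. */
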
L(ret_vec_x3):
bsf %VRCX, %VRCX
subq %rdi, %rax
# ifdef USE_AS_WCSLEN
subq $-(VEC_SIZE * 2), %rax
shrq $2, %rax
addq %rcx, %rax
# else
leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
# endif
cmpq %rsi, %rax
cmovnb %rsi, %rax
ret

L(ret_vec_x2):
subq $-VEC_SIZE, %rax
L(ret_vec_x1):
bsf %VRCX, %VRCX
subq %rdi, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
addq %rcx, %rax
cmpq %rsi, %rax
cmovnb %rsi, %rax
ret

L(page_cross):
mov %rdi, %rax
movl %edi, %ecx
andl $(VEC_SIZE - 1), %ecx
# ifdef USE_AS_WCSLEN
sarl $2, %ecx
# endif
/* ecx holds the number of [w]chars to skip as a result of the
address alignment. */
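/* Illustrative C sketch of the page-cross handling below: load the
whole aligned vector containing s and shift away the match bits of
[w]chars that precede s (the names are illustrative):

    mask = cmpeq_mask (load_vec (s & -VEC_SIZE), zero) >> skipped;
    if (mask)
      return min (tzcnt (mask), maxlen);
    if (maxlen > CHAR_PER_VEC - skipped)
      goto align_more;
    return maxlen; // no null among the first maxlen [w]chars
*/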
andq $-VEC_SIZE, %rax
VPCMPEQ (%rax), %VMM(0), %k0
KMOV %k0, %VRDX
/* Shift out the match bits of the [w]chars skipped due to the
alignment. */
shr %cl, %VRDX
jnz L(page_cross_end)
movl $CHAR_PER_VEC, %eax
sub %ecx, %eax
cmp %rax, %rsi
ja L(align_more)

L(page_cross_end):
bsf %VRDX, %VRAX
cmpq %rsi, %rax
cmovnb %esi, %eax
ret

END (STRNLEN)
#endif