/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

#if ISA_SHOULD_BUILD (4)

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif


# ifndef STRNLEN
#  define STRNLEN __strnlen_evex
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ vpcmpeqd
#  define VPCMPNEQ vpcmpneqd
#  define VPTESTN vptestnmd
#  define VPTEST vptestmd
#  define VPMINU vpminud
#  define CHAR_SIZE 4

# else
#  define VPCMPEQ vpcmpeqb
#  define VPCMPNEQ vpcmpneqb
#  define VPTESTN vptestnmb
#  define VPTEST vptestmb
#  define VPMINU vpminub
#  define CHAR_SIZE 1

#  define REG_WIDTH VEC_SIZE
# endif

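/* Number of CHARs handled by one VEC.  */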
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# include "reg-macros.h"

# if CHAR_PER_VEC == 32
#  define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8)
# else
#  define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32)
# endif



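/* Char offset, from the start of a 4x-VEC block, of the data handled by
   the fall-through return path after the main loop: the single fourth
   VEC when CHAR_PER_VEC == 64, otherwise the combined last 2x VEC.  */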
# if CHAR_PER_VEC == 64
#  define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
# else
#  define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
# endif


# define XZERO VMM_128(0)
# define VZERO VMM(0)
# define PAGE_SIZE 4096

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (STRNLEN, 6)
	/* Check zero length.  */
	test %RSI_LP, %RSI_LP
	jz L(zero)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %esi, %esi
# endif

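	/* If the first VEC_SIZE-byte load would cross a page boundary it
	   could fault even though the string itself ends within the page,
	   so take the cross-page path.  Only the low 12 bits of the
	   address are needed for this check (PAGE_SIZE == 4096).  */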
	movl %edi, %eax
	vpxorq %XZERO, %XZERO, %XZERO
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	ja L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
	   null CHAR.  */
	VPCMPEQ (%rdi), %VZERO, %k0

	KMOV %k0, %VRCX
	movq %rsi, %rax

	/* If src (rcx) is zero, bsf does not change the result.  NB:
	   Must use 64-bit bsf here so that upper bits of len are not
	   cleared.  */
	bsfq %rcx, %rax
	/* If rax > CHAR_PER_VEC then rcx must have been zero (no null
	   CHAR) and rsi must be > CHAR_PER_VEC.  */
	cmpq $CHAR_PER_VEC, %rax
	ja L(more_1x_vec)
	/* Check if the first match is in bounds.  */
	cmpq %rax, %rsi
	cmovb %esi, %eax
	ret


# if CHAR_PER_VEC != 32
	.p2align 4,, 2
L(zero):
L(max_0):
	movl %esi, %eax
	ret
# endif

	/* Aligned more generously than strlen; strnlen compares the
	   remaining length against 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC,
	   and 8 * CHAR_PER_VEC before reaching the loop.  */
	.p2align 4,, 10
L(more_1x_vec):
L(cross_page_continue):
	/* Compute the number of chars remaining after aligning.  */
# ifdef USE_AS_WCSLEN
	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
	   overflow.  */
	movq %rdi, %rax
	andq $(VEC_SIZE * -1), %rdi
	subq %rdi, %rax
	sarq $2, %rax
	leaq (CHAR_PER_VEC * -1)(%rax, %rsi), %rax
# else
	leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax
	andq $(VEC_SIZE * -1), %rdi
	subq %rdi, %rax
# endif
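	/* rax now holds the number of chars remaining counting from the
	   first aligned VEC boundary after %rdi (the length plus the
	   misalignment, minus CHAR_PER_VEC).  */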


	VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0

	cmpq $(CHAR_PER_VEC * 2), %rax
	ja L(more_2x_vec)

L(last_2x_vec_or_less):
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(last_vec_check)

	/* Check the end of data.  */
	SUB_SHORT (CHAR_PER_VEC, rax)
	jbe L(max_0)
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jz L(max_0)
	/* Best place for LAST_VEC_CHECK if ZMM.  */
	.p2align 4,, 8
L(last_vec_check):
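	/* edx = index of the first null CHAR within the VEC just tested;
	   eax = chars remaining counting from that VEC.  The result is
	   len - (remaining - index).  If the null falls past the bound
	   (index >= remaining, so the sub does not borrow), return the
	   max length instead via cmovae.  */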
	bsf %VRDX, %VRDX
	sub %eax, %edx
	lea (%rsi, %rdx), %eax
	cmovae %esi, %eax
	ret

# if CHAR_PER_VEC == 32
	.p2align 4,, 2
L(zero):
L(max_0):
	movl %esi, %eax
	ret
# endif

	.p2align 4,, 8
L(last_4x_vec_or_less):
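	/* Advance %rdi by 4x VEC and drop the remaining length by
	   4 * CHAR_PER_VEC.  The VEC at (VEC_SIZE * 5)(%rdi) is tested
	   before the pointer update, so afterwards the result in k0
	   corresponds to VEC_SIZE(%rdi) as the 2x-or-less path expects.  */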
	addl $(CHAR_PER_VEC * -4), %eax
	VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
	subq $(VEC_SIZE * -4), %rdi
	cmpl $(CHAR_PER_VEC * 2), %eax
	jbe L(last_2x_vec_or_less)

	.p2align 4,, 6
L(more_2x_vec):
	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
	   rechecking bounds.  */

	KMOV %k0, %VRDX

	test %VRDX, %VRDX
	jnz L(first_vec_x1)

	VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(first_vec_x2)

	cmpq $(CHAR_PER_VEC * 4), %rax
	ja L(more_4x_vec)


	VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX
	addl $(CHAR_PER_VEC * -2), %eax
	test %VRDX, %VRDX
	jnz L(last_vec_check)

	subl $(CHAR_PER_VEC), %eax
	jbe L(max_1)

	VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX

	test %VRDX, %VRDX
	jnz L(last_vec_check)
L(max_1):
	movl %esi, %eax
	ret

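	/* Return paths for a null CHAR found in the VEC at
	   (VEC_SIZE * 2)(%rdi) (x2) or at VEC_SIZE(%rdi) (x1).
	   len - remaining is the char offset of VEC_SIZE(%rdi) from the
	   start of the string, so the result is that offset plus the VEC
	   offset plus the bit index.  The more_2x_vec guarantee means the
	   null is known to be in bounds here.  */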
	.p2align 4,, 3
L(first_vec_x2):
# if VEC_SIZE == 64
	/* If VEC_SIZE == 64 we can fit the logic for the full return
	   label in the spare bytes before the next cache line.  */
	bsf %VRDX, %VRDX
	sub %eax, %esi
	leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
	ret
	.p2align 4,, 6
# else
	addl $CHAR_PER_VEC, %esi
# endif
L(first_vec_x1):
	bsf %VRDX, %VRDX
	sub %eax, %esi
	leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
	ret


	.p2align 4,, 6
L(first_vec_x4):
# if VEC_SIZE == 64
	/* If VEC_SIZE == 64 we can fit the logic for the full return
	   label in the spare bytes before the next cache line.  */
	bsf %VRDX, %VRDX
	sub %eax, %esi
	leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
	ret
	.p2align 4,, 6
# else
	addl $CHAR_PER_VEC, %esi
# endif
L(first_vec_x3):
	bsf %VRDX, %VRDX
	sub %eax, %esi
	leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
	ret

	.p2align 4,, 5
L(more_4x_vec):
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(first_vec_x3)

	VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(first_vec_x4)

	/* Check if we are within the last VEC_SIZE * 4 of the length
	   before aligning for the loop.  */
	cmpq $(CHAR_PER_VEC * 8), %rax
	jbe L(last_4x_vec_or_less)


	/* Recompute the number of chars remaining after aligning.  */
# ifdef USE_AS_WCSLEN
	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
	   overflow.  */
	leaq (VEC_SIZE * -3)(%rdi), %rdx
# else
	leaq (VEC_SIZE * -3)(%rdi, %rax), %rax
# endif

	subq $(VEC_SIZE * -1), %rdi

	/* Align data to VEC_SIZE * 4.  */
# if VEC_SIZE == 64
	/* Saves code size.  No evex512 processor has partial register
	   stalls.  If that changes, this can be replaced with `andq
	   $-(VEC_SIZE * 4), %rdi`.  */
	xorb %dil, %dil
# else
	andq $-(VEC_SIZE * 4), %rdi
# endif

# ifdef USE_AS_WCSLEN
	subq %rdi, %rdx
	sarq $2, %rdx
	addq %rdx, %rax
# else
	subq %rdi, %rax
# endif
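	/* rax = chars remaining counted from (VEC_SIZE * 4)(%rdi), which
	   matches the addressing used inside the loop; the loop stops
	   once this count drops to zero or below.  */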
	/* Compare 4 * VEC at a time forward.  */
	.p2align 4,, 11
L(loop_4x_vec):
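	/* VPMINU folds two VECs into one so a single VPTESTN can detect
	   a null CHAR in either: a lane of the minimum is zero iff that
	   lane is zero in at least one of the sources.  */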
	VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
	VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
	VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
	VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
	VPTESTN %VMM(2), %VMM(2), %k0
	VPTESTN %VMM(4), %VMM(4), %k2
	subq $-(VEC_SIZE * 4), %rdi
	/* Break if at end of length.  */
	subq $(CHAR_PER_VEC * 4), %rax
	jbe L(loop_len_end)


	KORTEST %k0, %k2
	jz L(loop_4x_vec)


L(loop_last_4x_vec):
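	/* rcx = maxlen, kept for the final clamp.  rsi = number of chars
	   from the start of the string to the end of the 4x VEC block
	   just loaded (rax counts what remains beyond that block).  */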
	movq %rsi, %rcx
	subq %rax, %rsi
	VPTESTN %VMM(1), %VMM(1), %k1
	KMOV %k1, %VRDX
	test %VRDX, %VRDX
	jnz L(last_vec_x0)

	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(last_vec_x1)

	VPTESTN %VMM(3), %VMM(3), %k0

	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
	   returning the last 2x VEC.  For VEC_SIZE == 64 we test each
	   VEC individually, for VEC_SIZE == 32 we combine them in a
	   single 64-bit GPR.  */
# if CHAR_PER_VEC == 64
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(last_vec_x2)
	KMOV %k2, %VRDX
# else
	/* We can only combine the last 2x VEC masks if CHAR_PER_VEC <= 32.
	 */
	kmovd %k2, %edx
	kmovd %k0, %eax
	salq $CHAR_PER_VEC, %rdx
	orq %rax, %rdx
# endif

	/* Fallthrough: the null CHAR is in the fourth VEC of the block
	   for ZMM, or somewhere in the combined last 2x VEC mask for YMM
	   (this mirrors first_vec_x3 in strlen-ZMM and first_vec_x2 in
	   strlen-YMM).  */
	bsfq %rdx, %rdx
	leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
	cmpq %rax, %rcx
	cmovb %rcx, %rax
	ret

	/* Handle the last 4x VEC after the loop.  All VECs have already
	   been loaded.  */
	.p2align 4,, 4
L(loop_len_end):
	KORTEST %k0, %k2
	jnz L(loop_last_4x_vec)
	movq %rsi, %rax
	ret


# if CHAR_PER_VEC == 64
	/* Since we can't combine the last 2x VEC masks when VEC_SIZE ==
	   64, we need a separate return label for the third VEC.  */
	.p2align 4,, 8
L(last_vec_x2):
	bsf %VRDX, %VRDX
	leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
	cmpq %rax, %rcx
	cmovb %rcx, %rax
	ret
# endif


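	/* The null CHAR was in the first (x0) or second (x1) VEC of the
	   4x block.  rsi counts chars up to the end of the block, so back
	   up 4x (or 3x) VEC, add the bit index, and clamp to maxlen in
	   rcx.  */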
	.p2align 4,, 10
L(last_vec_x1):
	addq $CHAR_PER_VEC, %rsi
L(last_vec_x0):
	bsf %VRDX, %VRDX
	leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
	cmpq %rax, %rcx
	cmovb %rcx, %rax
	ret


	.p2align 4,, 8
L(cross_page_boundary):
	/* Align data to VEC_SIZE.  */
	movq %rdi, %rcx
	andq $-VEC_SIZE, %rcx
	VPCMPEQ (%rcx), %VZERO, %k0

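	/* The mask in k0 covers the whole aligned VEC containing the
	   start of the string.  Shift it right by the start's char
	   offset within that VEC so that bit 0 corresponds to the first
	   char, then compare the index of the first null against both
	   the number of chars left in this VEC and the max length.  As
	   before, bsf leaves %VRDX equal to the max length when the
	   shifted mask is zero.  */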
	KMOV %k0, %VRCX
# ifdef USE_AS_WCSLEN
	shrl $2, %eax
	andl $(CHAR_PER_VEC - 1), %eax
# endif
	shrx %VRAX, %VRCX, %VRCX

	negl %eax
	andl $(CHAR_PER_VEC - 1), %eax
	movq %rsi, %rdx
	bsf %VRCX, %VRDX
	cmpq %rax, %rdx
	ja L(cross_page_continue)
	movl %edx, %eax
	cmpq %rdx, %rsi
	cmovb %esi, %eax
	ret
END (STRNLEN)
#endif

source code of glibc/sysdeps/x86_64/multiarch/strnlen-evex.S