/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

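/* Overview: the string is compared against a zero vector VEC_SIZE (32)
   bytes at a time with VPCMPEQ/vpmovmskb.  The first VEC is checked
   unaligned (with a separate path if that load would cross a page), the
   pointer is then rounded up to VEC_SIZE alignment and up to four more
   VECs are checked one at a time before entering the main loop, which
   covers VEC_SIZE * 4 bytes per iteration.  Register usage: %rdi is the
   current position, %rdx the original string pointer; for strnlen, %rsi
   is the remaining length and %r8 the saved maximum length.  */
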
	.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RSI_LP, %RSI_LP
#  else
	test	%RSI_LP, %RSI_LP
#  endif
	jz	L(zero)
	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0
	/* Clear high bits from edi. Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
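	/* An unaligned VEC_SIZE load that starts within the last
	   VEC_SIZE - 1 bytes of a page would touch the following page,
	   which may be unmapped, hence the check below.  */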
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
# ifdef USE_AS_STRNLEN
	/* If length < VEC_SIZE handle special.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	/* If empty continue to aligned_more. Otherwise return bit
	   position of first match.  */
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 4 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	incl	%edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 3 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 2 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 2 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 3 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 5
L(aligned_more):
	/* Align data to VEC_SIZE - 1. This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code on the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
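	/* %rdi now points to the last byte of the VEC_SIZE block
	   containing the string start, so the loads below at 1(%rdi),
	   (VEC_SIZE + 1)(%rdi), ... are all VEC_SIZE aligned.  */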
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
	   because it simplifies the logic in last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
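	/* For strnlen, %rcx now counts the chars from the start of the
	   string to one char past the end of the 4th VEC checked below;
	   subtracting it from the length (after the first load) detects
	   whether the limit falls within these 4 VECs.  */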
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
# ifdef USE_AS_STRNLEN
	/* Adjust length. If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 - 1.  */
# ifdef USE_AS_STRNLEN
	/* Before adjusting length check if at last VEC_SIZE * 4.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# else
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively, there is no issue
	   with overlap.  */
	vmovdqa	1(%rdi), %ymm1
	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4

	VPMINU	%ymm2, %ymm4, %ymm5
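	/* A char in ymm5 is zero iff the corresponding char is zero in at
	   least one of the four VECs, so comparing ymm5 against the zero
	   vector tests all 4 VECs at once.  */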
	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb	%ymm5, %ecx

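	/* NB: use subq with -(VEC_SIZE * 4) rather than addq with
	   VEC_SIZE * 4 so the immediate fits in a sign-extended imm8.  */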
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)


	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	subq	%rdx, %rdi
	testl	%eax, %eax
	jnz	L(last_vec_return_x0)

	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_return_x1)

	/* Combine last 2 VEC.  */
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb	%ymm3, %eax
	/* rcx has the combined result from all 4 VEC. It is only used if
	   none of the first 3 VEC contained a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
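	/* Since none of the first 3 VEC contain a zero char here, the
	   zero chars in the combined minimum (rcx) can only come from the
	   4th VEC.  With that mask in the upper 32 bits, a single tzcntq
	   yields the match offset relative to the start of the 3rd VEC.  */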
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN


# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
	subq	$-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	vpmovmskb	%ymm1, %eax
	/* If remaining length > VEC_SIZE * 2 handle the last 4 VEC below.
	   The test works even if esi is off by VEC_SIZE * 4, as that
	   offset only affects bits above the one tested.  */
	testl	$(VEC_SIZE * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off by VEC_SIZE * 4 in either direction
	   depending on where this was called from. This fixes that.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	subl	$VEC_SIZE, %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_return_x0):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 4 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_return_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_vec_x1_check):

	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

L(max):
	movq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	subl	$(VEC_SIZE * 3), %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 3 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN


	.p2align 4
L(last_vec_x1):
	/* Essentially a duplicate of first_vec_x1 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	/* Essentially a duplicate of first_vec_x2 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(VEC_SIZE * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 2 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
L(max_end):
	movq	%r8, %rax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	/* Align data to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
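	/* The aligned load below starts at the beginning of the VEC_SIZE
	   block containing the string start, so it stays within the
	   string's page and cannot fault, even though it may read up to
	   VEC_SIZE - 1 bytes before the string.  */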
	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
	   so no need to manually mod rdx.  */
	sarxl	%edx, %eax, %eax
# ifdef USE_AS_STRNLEN
	testl	%eax, %eax
	jnz	L(cross_page_less_vec)
	leaq	1(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %ecx
#  endif
	/* Check length.  */
	cmpq	%rsi, %rcx
	jb	L(cross_page_continue)
	movq	%r8, %rax
# else
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide length by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(cross_page_less_vec):
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide by 4 to convert from byte-count to length.  */
	shrl	$2, %eax
#  endif
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
	VZEROUPPER_RETURN
# endif

END (STRLEN)
#endif
