/* memchr/wmemchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

#if ISA_SHOULD_BUILD (4)

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifndef MEMCHR
#  define MEMCHR __memchr_evex
# endif

# ifdef USE_AS_WMEMCHR
#  define PC_SHIFT_GPR rcx
#  define VPTESTN vptestnmd
#  define VPBROADCAST vpbroadcastd
#  define VPMINU vpminud
#  define VPCMP vpcmpd
#  define VPCMPEQ vpcmpeqd
#  define CHAR_SIZE 4

#  define USE_WIDE_CHAR
# else
#  define PC_SHIFT_GPR rdi
#  define VPTESTN vptestnmb
#  define VPBROADCAST vpbroadcastb
#  define VPMINU vpminub
#  define VPCMP vpcmpb
#  define VPCMPEQ vpcmpeqb
#  define CHAR_SIZE 1
# endif

# include "reg-macros.h"


/* If not in RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has no VEX
   encoding), use VEX encoding in the loop so we can use
   vpcmpeqb + vptern, which is more efficient than the EVEX
   alternative.  */
# if defined USE_IN_RTM || VEC_SIZE == 64
#  undef COND_VZEROUPPER
#  undef VZEROUPPER_RETURN
#  undef VZEROUPPER

#  define COND_VZEROUPPER
#  define VZEROUPPER_RETURN ret
#  define VZEROUPPER

#  define USE_TERN_IN_LOOP 0
# else
#  define USE_TERN_IN_LOOP 1
#  undef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# if USE_TERN_IN_LOOP
  /* The bitmask from vpmovmskb has 4 bits set for each matching
     wchar, so the bit index is already a byte offset and we do not
     want to multiply the resulting index by CHAR_SIZE.  */
#  define TERN_CHAR_MULT 1
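  /* For example, a match at wchar index 3 sets bits 12-15 of the
     vpmovmskb mask, so tzcnt/bsf already yields the byte offset 12
     and no further scaling is needed.  */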

#  ifdef USE_AS_WMEMCHR
#   define TEST_END() inc %VRCX
#  else
#   define TEST_END() add %rdx, %rcx
#  endif
# else
#  define TERN_CHAR_MULT CHAR_SIZE
#  define TEST_END() KORTEST %k2, %k3
# endif

# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
#  ifndef USE_AS_WMEMCHR
#   define GPR_X0_IS_RET 1
#  else
#   define GPR_X0_IS_RET 0
#  endif
#  define GPR_X0 rax
# else
#  define GPR_X0_IS_RET 0
#  define GPR_X0 rdx
# endif
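/* GPR_X0 is the GPR that holds the match information for the first
   VEC of the loop when we reach L(last_vec_x0).  When that register
   is rax and CHAR_SIZE == 1 (GPR_X0_IS_RET) the bsf result can be
   turned into the return value in place.  */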

# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
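/* e.g. 32 chars per VEC for memchr with VEC_SIZE == 32 and 8 for
   wmemchr (CHAR_SIZE == 4); 64 and 16 respectively for the
   VEC_SIZE == 64 build.  */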

# if CHAR_PER_VEC == 64
#  define LAST_VEC_OFFSET (VEC_SIZE * 3)
# else
#  define LAST_VEC_OFFSET (VEC_SIZE * 2)
# endif
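/* LAST_VEC_OFFSET is the base offset used by the combined bsf return
   in L(loop_vec_ret): VEC 3 alone when CHAR_PER_VEC == 64, VEC 2 when
   the masks of the last 2x VEC are combined into one GPR.  */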
# if CHAR_PER_VEC >= 32
#  define MASK_GPR(...) VGPR(__VA_ARGS__)
# elif CHAR_PER_VEC == 16
#  define MASK_GPR(reg) VGPR_SZ(reg, 16)
# else
#  define MASK_GPR(reg) VGPR_SZ(reg, 8)
# endif

# define VMATCH VMM(0)
# define VMATCH_LO VMM_lo(0)

# define PAGE_SIZE 4096


        .section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCHR, 6)
        /* Check for zero length.  */
        test %RDX_LP, %RDX_LP
        jz L(zero_0)

# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
        VPBROADCAST %esi, %VMATCH
        /* Check if we may cross page boundary with one vector load.  */
        movl %edi, %eax
        andl $(PAGE_SIZE - 1), %eax
        cmpl $(PAGE_SIZE - VEC_SIZE), %eax
        ja L(page_cross)
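        /* eax holds the offset of rdi within its page; the branch
           above is taken when an unaligned VEC_SIZE load at rdi would
           touch the next, possibly unmapped, page.  */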

        VPCMPEQ (%rdi), %VMATCH, %k0
        KMOV %k0, %VRAX
# ifndef USE_AS_WMEMCHR
        /* If rax is zero then tzcnt -> CHAR_PER_VEC.  NB: there is
           already a dependency between rax and rsi so no worries about
           false-dep here.  */
        tzcnt %VRAX, %VRSI
        /* If rdx <= rsi then either 1) rax was non-zero (there was a
           match) but it was out of bounds or 2) rax was zero and rdx
           was <= CHAR_PER_VEC so we are done scanning.  */
        cmpq %rsi, %rdx
        /* NB: Use a branch to return zero/non-zero.  Common usage will
           branch on the result of the function (if the return is
           null/non-null).  This branch can be used to predict the
           ensuing one so there is no reason to extend the
           data-dependency with cmovcc.  */
        jbe L(zero_0)

        /* If rax is zero then len must be > CHAR_PER_VEC, otherwise
           since we already tested len vs tzcnt(rax) (in rsi) we are
           good to return this match.  */
        test %VRAX, %VRAX
        jz L(more_1x_vec)
        leaq (%rdi, %rsi), %rax
# else

        /* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
           > 1: tzcnt of a zero mask yields the register width, not
           CHAR_PER_VEC.  */
        cmpq $CHAR_PER_VEC, %rdx
        ja L(more_1x_vec)
        tzcnt %VRAX, %VRAX
        cmpl %eax, %edx
        jbe L(zero_0)
L(first_vec_x0_ret):
        leaq (%rdi, %rax, CHAR_SIZE), %rax
# endif
        ret

        /* Only fits in the first cache line for VEC_SIZE == 32.  */
# if VEC_SIZE == 32
        .p2align 4,, 2
L(zero_0):
        xorl %eax, %eax
        ret
# endif

        .p2align 4,, 9
L(more_1x_vec):
# ifdef USE_AS_WMEMCHR
        /* For wmemchr we still need to test if there was a match in
           the first VEC.  Use bsf to test for a match here so we can
           reuse L(first_vec_x0_ret).  */
        bsf %VRAX, %VRAX
        jnz L(first_vec_x0_ret)
# endif

L(page_cross_continue):
# ifdef USE_AS_WMEMCHR
        /* We can't use the end of the buffer to re-calculate the
           length for wmemchr as len * CHAR_SIZE may overflow.  */
        leaq -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
        andq $(VEC_SIZE * -1), %rdi
        subq %rdi, %rax
        sarq $2, %rax
        addq %rdx, %rax
# else
        leaq -(VEC_SIZE + 1)(%rdx, %rdi), %rax
        andq $(VEC_SIZE * -1), %rdi
        subq %rdi, %rax
# endif

        /* rax contains the remaining length - 1.  The -1 lets us get
           imm8 encoding in a few additional places, saving code
           size.  */
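        /* Concretely, for memchr: rax = len + (original rdi %
           VEC_SIZE) - VEC_SIZE - 1, i.e. the number of chars left
           starting at VEC_SIZE(%rdi) minus one, with rdi now rounded
           down to a VEC_SIZE boundary.  The wmemchr path computes the
           same quantity in wchars by doing the pointer arithmetic
           first and converting with sar.  */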

        /* Needed regardless of remaining length.  */
        VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
        KMOV %k0, %VRDX

        /* We cannot fold the above `sub %rdi, %rax` with the `cmp
           $(CHAR_PER_VEC * 2), %rax` because it's possible for a very
           large length to overflow and cause the subtract to carry
           despite the length being above CHAR_PER_VEC * 2.  */
        cmpq $(CHAR_PER_VEC * 2 - 1), %rax
        ja L(more_2x_vec)
L(last_2x_vec):

        test %VRDX, %VRDX
        jnz L(first_vec_x1_check)

        /* Check the end of the data.  NB: use 8-bit operations to save
           code size.  We no longer need the full width of eax and will
           perform a write-only operation on eax so there will be no
           partial-register stalls.  */
        subb $(CHAR_PER_VEC * 1 - 1), %al
        jle L(zero_0)
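        /* al now holds the number of in-bounds chars in the VEC at
           (VEC_SIZE * 2)(%rdi) (at most CHAR_PER_VEC).  */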

        VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
        KMOV %k0, %VRCX
# ifdef USE_AS_WMEMCHR
        /* For wmemchr we can't take advantage of tzcnt(0) == VEC_SIZE
           as CHAR_PER_VEC != VEC_SIZE.  */
        test %VRCX, %VRCX
        jz L(zero_0)
# endif
        tzcnt %VRCX, %VRCX
        cmp %cl, %al

        /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We give
           fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
           not enough space before the next cache line to fit the `lea`
           for the return.  */
# if VEC_SIZE == 64
        ja L(first_vec_x2_ret)
L(zero_0):
        xorl %eax, %eax
        ret
# else
        jbe L(zero_0)
        leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
        ret
# endif

        .p2align 4,, 5
L(first_vec_x1_check):
        bsf %VRDX, %VRDX
        cmpb %dl, %al
        jb L(zero_4)
        leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
        ret

        /* Fits at the end of the cache line here for VEC_SIZE == 32.  */
# if VEC_SIZE == 32
L(zero_4):
        xorl %eax, %eax
        ret
# endif


        .p2align 4,, 4
L(first_vec_x2):
        bsf %VRCX, %VRCX
L(first_vec_x2_ret):
        leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
        ret

        /* Fits at the end of the cache line here for VEC_SIZE == 64.  */
# if VEC_SIZE == 64
L(zero_4):
        xorl %eax, %eax
        ret
# endif

        .p2align 4,, 4
L(first_vec_x1):
        bsf %VRDX, %VRDX
        leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
        ret


        .p2align 4,, 5
L(more_2x_vec):
        /* Length > CHAR_PER_VEC * 2 so check the first 2x VEC before
           rechecking the length.  */


        /* Matches for the VEC at VEC_SIZE(%rdi) were already computed
           into rdx above.  */
        test %VRDX, %VRDX
        jnz L(first_vec_x1)


        VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(first_vec_x2)

        /* Needed regardless of the next length check.  */
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
        KMOV %k0, %VRCX

        /* Check if we are near the end.  */
        cmpq $(CHAR_PER_VEC * 4 - 1), %rax
        ja L(more_4x_vec)

        test %VRCX, %VRCX
        jnz L(first_vec_x3_check)

        /* Use 8-bit instructions to save code size.  We won't use the
           full width of eax again and will perform a write-only
           operation to eax so no worries about partial-register
           stalls.  */
        subb $(CHAR_PER_VEC * 3), %al
        jb L(zero_2)
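        /* al is now the number of in-bounds chars in the VEC at
           (VEC_SIZE * 4)(%rdi) minus one (the -1 bias carries
           through).  */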
L(last_vec_check):
        VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
        KMOV %k0, %VRCX
# ifdef USE_AS_WMEMCHR
        /* For wmemchr we can't take advantage of tzcnt(0) == VEC_SIZE
           as CHAR_PER_VEC != VEC_SIZE.  */
        test %VRCX, %VRCX
        jz L(zero_2)
# endif
        tzcnt %VRCX, %VRCX
        cmp %cl, %al
        jae L(first_vec_x4_ret)
L(zero_2):
        xorl %eax, %eax
        ret

        /* Fits at the end of the cache line here for VEC_SIZE == 64.
           For VEC_SIZE == 32 we put the return label at the end of
           L(first_vec_x4).  */
# if VEC_SIZE == 64
L(first_vec_x4_ret):
        leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
        ret
# endif

        .p2align 4,, 6
L(first_vec_x4):
        bsf %VRCX, %VRCX
# if VEC_SIZE == 32
        /* Place L(first_vec_x4_ret) here as we can't fit it in the
           same cache line as where it is called from, so we might as
           well save code size by reusing the return of
           L(first_vec_x4).  */
L(first_vec_x4_ret):
# endif
        leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
        ret

        .p2align 4,, 6
L(first_vec_x3_check):
        /* Need to adjust the remaining length before checking.  */
        addb $-(CHAR_PER_VEC * 2), %al
        bsf %VRCX, %VRCX
        cmpb %cl, %al
        jb L(zero_2)
        leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
        ret

        .p2align 4,, 6
L(first_vec_x3):
        bsf %VRCX, %VRCX
        leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
        ret

        .p2align 4,, 3
# if !USE_TERN_IN_LOOP
        .p2align 4,, 10
# endif
L(more_4x_vec):
        test %VRCX, %VRCX
        jnz L(first_vec_x3)

        VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(first_vec_x4)

        subq $-(VEC_SIZE * 5), %rdi
        subq $(CHAR_PER_VEC * 8), %rax
        jb L(last_4x_vec)

# ifdef USE_AS_WMEMCHR
        movl %edi, %ecx
# else
        addq %rdi, %rax
# endif


# if VEC_SIZE == 64
        /* Use xorb to do `andq $-(VEC_SIZE * 4), %rdi`.  No EVEX
           processor has partial-register stalls (all have a merging
           uop).  If that changes this can be removed.  */
        xorb %dil, %dil
# else
        andq $-(VEC_SIZE * 4), %rdi
# endif

# ifdef USE_AS_WMEMCHR
        subl %edi, %ecx
        sarl $2, %ecx
        addq %rcx, %rax
# else
        subq %rdi, %rax
# endif
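        /* rax is now (chars remaining from the newly aligned rdi) - 1
           - CHAR_PER_VEC * 4, so the borrow from the subtraction at
           the bottom of the loop tells us when at most 4x VEC are
           left.  */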



# if USE_TERN_IN_LOOP
        /* Copy VMATCH to a low ymm (ymm0-15) so we can use vpcmpeq,
           which is not encodable with the EVEX-only registers.  NB:
           this is VEC_SIZE == 32 only as there is no way to encode
           vpcmpeq with zmm0-15.  */
        vmovdqa64 %VMATCH, %VMATCH_LO
# endif

        .p2align 4,, 11
L(loop_4x_vec):
        /* Two versions of the loop: one that does not require
           vzeroupper by not using ymm0-15, and another that does
           require vzeroupper because it uses ymm0-15.  The reason
           ymm0-15 is used at all is that there is no EVEX encoding of
           vpcmpeq, and with vpcmpeq this loop can be performed more
           efficiently.  The non-vzeroupper version is safe for RTM
           while the vzeroupper version should be preferred if RTM is
           not supported.  Which loop version we use is determined by
           USE_TERN_IN_LOOP.  */

# if USE_TERN_IN_LOOP
        /* Since vptern can only take 3x vectors, it is fastest to do
           1x VEC separately with EVEX vpcmp.  */
#  ifdef USE_AS_WMEMCHR
        /* vptern can only accept masks for epi32/epi64, so we can only
           save an instruction by using a not-equals mask with vptern
           for wmemchr.  */
        VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
#  else
        VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
#  endif
        /* Compare 3x with vpcmpeq and OR them all together with
           vptern.  */
        VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
#  ifdef USE_AS_WMEMCHR
        /* This takes the NOT of the OR of VEC_lo(2), VEC_lo(3) and
           VEC_lo(4) (truth-table immediate 1) as well as combining the
           result from VEC(0) via the k1 zero-mask.  */
        vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
        vpmovmskb %VMM_lo(4), %VRCX
#  else
        /* 254 is the truth-table immediate for ORing VEC_lo(2),
           VEC_lo(3) and VEC_lo(4) into VEC_lo(4).  */
        vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
        vpmovmskb %VMM_lo(4), %VRCX
        KMOV %k1, %edx
#  endif
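        /* At this point, for memchr, rcx holds the byte mask of
           matches in VEC 1-3 and rdx holds the VEC 0 mask from k1.
           For wmemchr the sense of rcx is inverted: VEC 0 is folded in
           through the k1 zero-mask and an all-ones rcx means no match
           in any of the 4x VEC.  */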

# else
        /* Loop version that uses EVEX encoding.  */
        VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
        vpxorq (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
        vpxorq (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3
        VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z}
        VPTESTN %VMM(3), %VMM(3), %k2
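        /* vpxorq yields an all-zero char wherever VEC 1/2 match.
           VPMINU of the two xor results, zero-masked by k1 (the
           not-equals mask of VEC 0), is zero in any char where VEC 0,
           1 or 2 matched, which VPTESTN records in k2.  k3 holds the
           VEC 3 matches, so KORTEST of k2 and k3 (TEST_END) checks all
           4x VEC at once.  */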
# endif


        TEST_END ()
        jnz L(loop_vec_ret)

        subq $-(VEC_SIZE * 4), %rdi

        subq $(CHAR_PER_VEC * 4), %rax
        jae L(loop_4x_vec)

        /* COND_VZEROUPPER is vzeroupper if we use the VEX encoded
           loop.  */
        COND_VZEROUPPER

        .p2align 4,, 10
L(last_4x_vec):
        /* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
           instructions on eax from here on out.  */
# if CHAR_PER_VEC != 64
        andl $(CHAR_PER_VEC * 4 - 1), %eax
# endif
        VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0
        subq $(VEC_SIZE * 1), %rdi
        KMOV %k0, %VRDX
        cmpb $(CHAR_PER_VEC * 2 - 1), %al
        jbe L(last_2x_vec)
        test %VRDX, %VRDX
        jnz L(last_vec_x1_novzero)

        VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
        KMOV %k0, %VRDX
        test %VRDX, %VRDX
        jnz L(last_vec_x2_novzero)

        VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(first_vec_x3_check)

        subb $(CHAR_PER_VEC * 3), %al
        jae L(last_vec_check)

        xorl %eax, %eax
        ret

# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
L(last_vec_x2_novzero):
        addq $VEC_SIZE, %rdi
L(last_vec_x1_novzero):
        bsf %VRDX, %VRDX
        leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
        ret
# endif

# if CHAR_PER_VEC == 64
        /* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
           64 it needs a separate return label.  */
        .p2align 4,, 4
L(last_vec_x2):
L(last_vec_x2_novzero):
        bsf %VRDX, %VRDX
        leaq (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
        ret
# endif

        .p2align 4,, 4
L(loop_vec_ret):
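        /* Check the first VEC of the loop.  When k1 holds a not-equals
           mask (the EVEX loop, or the wmemchr tern loop) it is
           all-ones if there was no match, so the inc sets ZF in that
           case while preserving the position of the lowest set bit for
           the bsf at L(last_vec_x0).  For the memchr tern loop rdx
           already holds the equals mask of VEC 0.  */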
# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
        KMOV %k1, %VRAX
        inc %MASK_GPR(rax)
# else
        test %VRDX, %VRDX
# endif
        jnz L(last_vec_x0)


# if USE_TERN_IN_LOOP
        vpmovmskb %VMM_lo(2), %VRDX
# else
        VPTESTN %VMM(2), %VMM(2), %k1
        KMOV %k1, %VRDX
# endif
        test %VRDX, %VRDX
        jnz L(last_vec_x1)


# if USE_TERN_IN_LOOP
        vpmovmskb %VMM_lo(3), %VRDX
# else
        KMOV %k2, %VRDX
# endif

        /* We no longer need any of the lo vecs (ymm0-15) so vzeroupper
           (only if we used the VEX encoded loop).  */
        COND_VZEROUPPER

        /* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
           CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
           CHAR_PER_VEC <= 32 we can combine the results from the 2x
           VEC in a single GPR.  */
# if CHAR_PER_VEC == 64
#  if USE_TERN_IN_LOOP
#   error "Unsupported"
#  endif


        /* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
        test %VRDX, %VRDX
        jnz L(last_vec_x2)
        KMOV %k3, %VRDX
# else
        /* CHAR_PER_VEC <= 32 so we can combine the results from the
           last 2x VEC.  */

#  if !USE_TERN_IN_LOOP
        KMOV %k3, %VRCX
#  endif
        salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx
        addq %rcx, %rdx
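        /* Shift the matches of the later VEC above those of VEC 2 so
           that a single bsf finds the first match in the last 2x VEC
           (the earlier VECs were already ruled out above).  */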
#  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
L(last_vec_x2_novzero):
#  endif
# endif
        bsf %rdx, %rdx
        leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
        ret

        .p2align 4,, 8
L(last_vec_x1):
        COND_VZEROUPPER
# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
L(last_vec_x1_novzero):
# endif
        bsf %VRDX, %VRDX
        leaq (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
        ret


        .p2align 4,, 4
L(last_vec_x0):
        COND_VZEROUPPER
        bsf %VGPR(GPR_X0), %VGPR(GPR_X0)
# if GPR_X0_IS_RET
        addq %rdi, %rax
# else
        leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax
# endif
        ret

        .p2align 4,, 6
L(page_cross):
        /* eax (the offset of rdi within the page) is needed to compute
           the number of in-bounds chars we are checking; save it
           (negated for memchr) in ecx before it is clobbered.  */
# ifdef USE_AS_WMEMCHR
        movl %eax, %ecx
# else
        xorl %ecx, %ecx
        subl %eax, %ecx
# endif

        xorq %rdi, %rax
        VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
        KMOV %k0, %VRAX
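        /* Since eax held rdi & (PAGE_SIZE - 1), the xor above turned
           rax into the page base, so the compare covers the last VEC
           of the page: the load cannot fault even though it starts
           before rdi.  */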

# ifdef USE_AS_WMEMCHR
        /* NB: Divide by CHAR_SIZE to convert the byte offset into a
           wchar count before shifting out the out-of-bounds chars.  */
        shrl $2, %ecx
        andl $(CHAR_PER_VEC - 1), %ecx
# endif


        shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
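        /* The shrx above shifts out the mask bits for the chars before
           rdi (the compare covered the last VEC of the page, which
           starts before rdi).  For memchr the shift count register is
           rdi itself: shrx only uses the low bits of the count, which
           here equal rdi % VEC_SIZE.  */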

# ifdef USE_AS_WMEMCHR
        negl %ecx
# endif

        /* Mask the low bits of ecx (the negated page offset, in chars)
           to get the number of chars from rdi to the end of the page,
           i.e. the in-bounds chars covered by the compare above.  */
        andl $(CHAR_PER_VEC - 1), %ecx

        /* Check if the search is entirely contained within the
           remainder of the page (len <= chars to the end of the
           page).  */
        cmpq %rcx, %rdx
        jbe L(page_cross_ret)

        /* The length crosses the page, so if rax is zero (no matches)
           continue on the main path.  */
        test %VRAX, %VRAX
        jz L(page_cross_continue)

        /* If rdx > rcx then any match here must be in [buf:buf + len].  */
        tzcnt %VRAX, %VRAX
# ifdef USE_AS_WMEMCHR
        leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
        addq %rdi, %rax
# endif
        ret

        .p2align 4,, 2
L(page_cross_zero):
        xorl %eax, %eax
        ret

        .p2align 4,, 4
L(page_cross_ret):
        /* The search is entirely contained in the page-cross case.  */
# ifdef USE_AS_WMEMCHR
        test %VRAX, %VRAX
        jz L(page_cross_zero)
# endif
        tzcnt %VRAX, %VRAX
        cmpl %eax, %edx
        jbe L(page_cross_zero)
# ifdef USE_AS_WMEMCHR
        leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
        addq %rdi, %rax
# endif
        ret
END (MEMCHR)
#endif

source code of glibc/sysdeps/x86_64/multiarch/memchr-evex.S