1/* Optimized memchr with sse2 without bsf
2 Copyright (C) 2011-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
/* Unwind (CFI) bookkeeping for a 4-byte push of REG: the CFA offset grows
   by 4 and REG is recorded as saved at relative offset 0.  The cfi_*
   helpers are not defined in this file; presumably they come from
   <sysdep.h>.  */
23# define CFI_PUSH(REG) \
24 cfi_adjust_cfa_offset (4); \
25 cfi_rel_offset (REG, 0)
26
/* Inverse bookkeeping for a 4-byte pop of REG.  */
27# define CFI_POP(REG) \
28 cfi_adjust_cfa_offset (-4); \
29 cfi_restore (REG)
30
/* pushl/popl bundled with the matching CFI annotation.  */
31# define PUSH(REG) pushl REG; CFI_PUSH (REG)
32# define POP(REG) popl REG; CFI_POP (REG)
33
/* The memchr build saves %edi on entry, so its first stack argument is at
   8(%esp); the rawmemchr build pushes nothing and uses 4(%esp).
   NOTE(review): the 4 presumably accounts for the i386 return address.
   RETURN restores %edi and returns; the trailing CFI_PUSH re-establishes
   the "%edi is pushed" unwind state for whatever code follows the ret in
   the file.  RETURN is only defined for the memchr build; the rawmemchr
   paths use a plain ret.  */
34# ifndef USE_AS_RAWMEMCHR
35# define ENTRANCE PUSH(%edi);
36# define PARMS 8
37# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
38# else
39# define ENTRANCE
40# define PARMS 4
41# endif
42
/* Stack offsets of the arguments, in order: buffer pointer (STR1), the
   byte to look for (STR2) and, for memchr only, the byte count (LEN).  */
43# define STR1 PARMS
44# define STR2 STR1+4
45
46# ifndef USE_AS_RAWMEMCHR
47# define LEN STR2+4
48# endif
49
/* Symbol name of the function; an including file may define MEMCHR first
   to override the default.  */
50# ifndef MEMCHR
51# define MEMCHR __memchr_sse2
52# endif
53
/* MEMCHR: look for the byte passed at STR2, starting at the pointer passed
   at STR1.  The memchr build also takes a byte count at LEN.  The result
   is returned in %eax: the address of the first matching byte, or (memchr
   build) 0 when the count is exhausted first.

   Register roles in the memchr build:
     %edi  current scan pointer       %edx  bytes remaining
   In the USE_AS_RAWMEMCHR build %edx is the scan pointer and there is no
   count.  In both builds %xmm1 holds the search byte replicated into all
   16 lanes, %eax receives pmovmskb match masks, and %ecx is an
   offset/scratch register.  */
54 atom_text_section
55ENTRY (MEMCHR)
56 ENTRANCE
57 mov STR1(%esp), %ecx
58 movd STR2(%esp), %xmm1
59# ifndef USE_AS_RAWMEMCHR
60 mov LEN(%esp), %edx
61 test %edx, %edx
/* Zero-length search: nothing can match.  */
62 jz L(return_null)
63# endif
64
/* Replicate the low byte of %xmm1: the two punpcklbw steps produce four
   copies in the low dword, and pshufd $0 below copies that dword into
   every lane.  The integer instructions are interleaved with them.  */
65 punpcklbw %xmm1, %xmm1
66# ifndef USE_AS_RAWMEMCHR
67 mov %ecx, %edi
68# else
69 mov %ecx, %edx
70# endif
71 punpcklbw %xmm1, %xmm1
72
/* %ecx = offset of the start pointer within its 64-byte block.  Above 48,
   a 16-byte load from the pointer would extend past that block, so the
   aligned L(crosscache) path is used instead.  */
73 and $63, %ecx
74 pshufd $0, %xmm1, %xmm1
75 cmp $48, %ecx
76 ja L(crosscache)
77
/* Unaligned probe of the first 16 bytes.  */
78# ifndef USE_AS_RAWMEMCHR
79 movdqu (%edi), %xmm0
80# else
81 movdqu (%edx), %xmm0
82# endif
83 pcmpeqb %xmm1, %xmm0
84 pmovmskb %xmm0, %eax
85 test %eax, %eax
86# ifndef USE_AS_RAWMEMCHR
/* A hit still has to be checked against the count (case2 exits).  */
87 jnz L(match_case2_prolog)
88
/* No hit in the first 16 bytes: done if the count was 16 or less.  */
89 sub $16, %edx
90 jbe L(return_null)
/* Advance to the next 16-byte boundary.  With m = start & 15 the new
   pointer is start + 16 - m, so m already-scanned bytes will be scanned
   again; add m back to the remaining count to compensate.  */
91 lea 16(%edi), %edi
92 and $15, %ecx
93 and $-16, %edi
94 add %ecx, %edx
95# else
/* No count to check in the rawmemchr build.  */
96 jnz L(match_case1_prolog)
97 lea 16(%edx), %edx
98 and $-16, %edx
99# endif
100 jmp L(loop_prolog)
101
/* Reached when the start pointer's offset within its 64-byte block is
   above 48.  Read the enclosing 16-byte-aligned vector instead of doing an
   unaligned load, and discard the mask bits belonging to bytes that lie
   before the start pointer.  */
102 .p2align 4
103L(crosscache):
/* %ecx = misalignment of the start pointer within 16 bytes.  */
104 and $15, %ecx
105# ifndef USE_AS_RAWMEMCHR
106 and $-16, %edi
107 movdqa (%edi), %xmm0
108# else
109 and $-16, %edx
110 movdqa (%edx), %xmm0
111# endif
112 pcmpeqb %xmm1, %xmm0
113 pmovmskb %xmm0, %eax
/* Shift out the mask bits of the %ecx bytes preceding the start pointer;
   bit 0 now corresponds to the start pointer itself.  */
114 sar %cl, %eax
115 test %eax, %eax
116
117# ifndef USE_AS_RAWMEMCHR
/* On a hit, the target adds %ecx back to the aligned pointer and then
   checks the match position against the count.  */
118 jnz L(match_case2_prolog1)
119 /* "ecx" is less than 16. Calculate "edx + ecx - 16" by using
120 "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to avoid
121 possible addition overflow. */
122 neg %ecx
123 add $16, %ecx
124 sub %ecx, %edx
/* The count ended inside this first vector: no match.  */
125 jbe L(return_null)
126 lea 16(%edi), %edi
127# else
128 jnz L(match_case1_prolog1)
129 lea 16(%edx), %edx
130# endif
131
/* Entered with a 16-byte-aligned scan pointer (either by the jmp from the
   unaligned probe or by falling through from L(crosscache)).  Scans up to
   two groups of four 16-byte vectors one vector at a time, then rounds the
   pointer down to a 64-byte boundary for L(align64_loop).

   In the memchr build 64 is subtracted from %edx before each group; if 64
   or fewer bytes remained, L(exit_loop) handles them (it adds the 64 back
   and checks the count per vector).  While scanning, %ecx is the offset
   (0, 16, 32, 48) of the vector being tested; L(match_case1) adds it to
   the pointer.  */
132 .p2align 4
133L(loop_prolog):
134# ifndef USE_AS_RAWMEMCHR
135 sub $64, %edx
136 jbe L(exit_loop)
137 movdqa (%edi), %xmm0
138# else
139 movdqa (%edx), %xmm0
140# endif
/* First group, vector at offset 0.  */
141 pcmpeqb %xmm1, %xmm0
142 xor %ecx, %ecx
143 pmovmskb %xmm0, %eax
144 test %eax, %eax
145 jnz L(match_case1)
146
/* First group, vector at offset 16.  */
147# ifndef USE_AS_RAWMEMCHR
148 movdqa 16(%edi), %xmm2
149# else
150 movdqa 16(%edx), %xmm2
151# endif
152 pcmpeqb %xmm1, %xmm2
153 lea 16(%ecx), %ecx
154 pmovmskb %xmm2, %eax
155 test %eax, %eax
156 jnz L(match_case1)
157
/* First group, vector at offset 32.  */
158# ifndef USE_AS_RAWMEMCHR
159 movdqa 32(%edi), %xmm3
160# else
161 movdqa 32(%edx), %xmm3
162# endif
163 pcmpeqb %xmm1, %xmm3
164 lea 16(%ecx), %ecx
165 pmovmskb %xmm3, %eax
166 test %eax, %eax
167 jnz L(match_case1)
168
/* First group, vector at offset 48.  */
169# ifndef USE_AS_RAWMEMCHR
170 movdqa 48(%edi), %xmm4
171# else
172 movdqa 48(%edx), %xmm4
173# endif
174 pcmpeqb %xmm1, %xmm4
175 lea 16(%ecx), %ecx
176 pmovmskb %xmm4, %eax
177 test %eax, %eax
178 jnz L(match_case1)
179
/* Second group of 64 bytes, handled exactly like the first.  */
180# ifndef USE_AS_RAWMEMCHR
181 lea 64(%edi), %edi
182 sub $64, %edx
183 jbe L(exit_loop)
184
185 movdqa (%edi), %xmm0
186# else
187 lea 64(%edx), %edx
188 movdqa (%edx), %xmm0
189# endif
190 pcmpeqb %xmm1, %xmm0
191 xor %ecx, %ecx
192 pmovmskb %xmm0, %eax
193 test %eax, %eax
194 jnz L(match_case1)
195
196# ifndef USE_AS_RAWMEMCHR
197 movdqa 16(%edi), %xmm2
198# else
199 movdqa 16(%edx), %xmm2
200# endif
201 pcmpeqb %xmm1, %xmm2
202 lea 16(%ecx), %ecx
203 pmovmskb %xmm2, %eax
204 test %eax, %eax
205 jnz L(match_case1)
206
207# ifndef USE_AS_RAWMEMCHR
208 movdqa 32(%edi), %xmm3
209# else
210 movdqa 32(%edx), %xmm3
211# endif
212 pcmpeqb %xmm1, %xmm3
213 lea 16(%ecx), %ecx
214 pmovmskb %xmm3, %eax
215 test %eax, %eax
216 jnz L(match_case1)
217
218# ifndef USE_AS_RAWMEMCHR
219 movdqa 48(%edi), %xmm4
220# else
221 movdqa 48(%edx), %xmm4
222# endif
223 pcmpeqb %xmm1, %xmm4
224 lea 16(%ecx), %ecx
225 pmovmskb %xmm4, %eax
226 test %eax, %eax
227 jnz L(match_case1)
228
/* Step past the second group and round the pointer down to a 64-byte
   boundary.  In the memchr build the (pointer & 63) bytes that will be
   scanned a second time are added back to the remaining count.  */
229# ifndef USE_AS_RAWMEMCHR
230 lea 64(%edi), %edi
231 mov %edi, %ecx
232 and $-64, %edi
233 and $63, %ecx
234 add %ecx, %edx
235# else
236 lea 64(%edx), %edx
237 and $-64, %edx
238# endif
239
/* Main loop: the scan pointer is 64-byte aligned and 64 bytes are tested
   per iteration.  The four compare results are merged with pmaxub so that
   one pmovmskb/test decides whether any of the 64 bytes matched.  */
240 .p2align 4
241L(align64_loop):
242
243# ifndef USE_AS_RAWMEMCHR
/* Leave the loop when 64 or fewer bytes remain; L(exit_loop) adds the 64
   back and finishes with per-vector count checks.  */
244 sub $64, %edx
245 jbe L(exit_loop)
246 movdqa (%edi), %xmm0
247 movdqa 16(%edi), %xmm2
248 movdqa 32(%edi), %xmm3
249 movdqa 48(%edi), %xmm4
250# else
251 movdqa (%edx), %xmm0
252 movdqa 16(%edx), %xmm2
253 movdqa 32(%edx), %xmm3
254 movdqa 48(%edx), %xmm4
255# endif
256 pcmpeqb %xmm1, %xmm0
257 pcmpeqb %xmm1, %xmm2
258 pcmpeqb %xmm1, %xmm3
259 pcmpeqb %xmm1, %xmm4
260
/* Merge all four results into %xmm4.  %xmm0 and %xmm2 stay intact;
   %xmm3 and %xmm4 no longer hold their individual results.  */
261 pmaxub %xmm0, %xmm3
262 pmaxub %xmm2, %xmm4
263 pmaxub %xmm3, %xmm4
264# ifndef USE_AS_RAWMEMCHR
265 add $64, %edi
266# else
267 add $64, %edx
268# endif
269 pmovmskb %xmm4, %eax
270
271 test %eax, %eax
272 jz L(align64_loop)
273
/* Something in this 64-byte block matched: step the pointer back to the
   start of the block and find which vector it was.  */
274# ifndef USE_AS_RAWMEMCHR
275 sub $64, %edi
276# else
277 sub $64, %edx
278# endif
279
/* %xmm0 and %xmm2 still hold the results for offsets 0 and 16.  */
280 pmovmskb %xmm0, %eax
281 xor %ecx, %ecx
282 test %eax, %eax
283 jnz L(match_case1)
284
285 pmovmskb %xmm2, %eax
286 lea 16(%ecx), %ecx
287 test %eax, %eax
288 jnz L(match_case1)
289
/* The result for offset 32 was overwritten by the merge: reload and
   compare again.  */
290# ifndef USE_AS_RAWMEMCHR
291 movdqa 32(%edi), %xmm3
292# else
293 movdqa 32(%edx), %xmm3
294# endif
295 pcmpeqb %xmm1, %xmm3
296 pmovmskb %xmm3, %eax
297 lea 16(%ecx), %ecx
298 test %eax, %eax
299 jnz L(match_case1)
300
/* The first three vectors had no hit, so the match is at offset 48.  The
   compare writes its result into %xmm1 (the broadcast byte is not used
   again on this path) and control falls through into L(match_case1)
   without a test.  */
301# ifndef USE_AS_RAWMEMCHR
302 pcmpeqb 48(%edi), %xmm1
303# else
304 pcmpeqb 48(%edx), %xmm1
305# endif
306 pmovmskb %xmm1, %eax
307 lea 16(%ecx), %ecx
308
/* Match exit without a count check.  On entry %eax is a non-zero 16-bit
   pmovmskb mask and %ecx is the offset to add to the scan pointer.  The
   lowest set bit of the mask is located with a cascade of test
   instructions (the file header describes this variant as "without bsf")
   and the pointer plus that bit index is returned in %eax.

   The rawmemchr build adds two extra entry labels: match_case1_prolog1
   (reached from L(crosscache); still adds %ecx to the pointer) and
   match_case1_prolog (reached from the unaligned probe; the pointer is
   already the base of the mask).  */
309 .p2align 4
310L(match_case1):
311# ifndef USE_AS_RAWMEMCHR
312 add %ecx, %edi
313# else
314L(match_case1_prolog1):
315 add %ecx, %edx
316L(match_case1_prolog):
317# endif
/* Low mask byte zero: the first match is among bytes 8..15.  */
318 test %al, %al
319 jz L(match_case1_high)
/* Low nibble zero: the first match is among bytes 4..7.  */
320 mov %al, %cl
321 and $15, %cl
322 jz L(match_case1_8)
323 test $0x01, %al
324 jnz L(ExitCase1_1)
325 test $0x02, %al
326 jnz L(ExitCase1_2)
327 test $0x04, %al
328 jnz L(ExitCase1_3)
/* Bits 0..2 are clear and the nibble is non-zero, so bit 3 is set.  */
329# ifndef USE_AS_RAWMEMCHR
330 lea 3(%edi), %eax
331 RETURN
332# else
333 lea 3(%edx), %eax
334 ret
335# endif
336
/* First match among bytes 4..7; bit 7 is the one left if 4..6 are clear.  */
337 .p2align 4
338L(match_case1_8):
339 test $0x10, %al
340 jnz L(ExitCase1_5)
341 test $0x20, %al
342 jnz L(ExitCase1_6)
343 test $0x40, %al
344 jnz L(ExitCase1_7)
345# ifndef USE_AS_RAWMEMCHR
346 lea 7(%edi), %eax
347 RETURN
348# else
349 lea 7(%edx), %eax
350 ret
351# endif
352
/* First match among bytes 8..15: the same cascade applied to %ah.  */
353 .p2align 4
354L(match_case1_high):
355 mov %ah, %ch
356 and $15, %ch
357 jz L(match_case1_high_8)
358 test $0x01, %ah
359 jnz L(ExitCase1_9)
360 test $0x02, %ah
361 jnz L(ExitCase1_10)
362 test $0x04, %ah
363 jnz L(ExitCase1_11)
364# ifndef USE_AS_RAWMEMCHR
365 lea 11(%edi), %eax
366 RETURN
367# else
368 lea 11(%edx), %eax
369 ret
370# endif
371
/* First match among bytes 12..15.  */
372 .p2align 4
373L(match_case1_high_8):
374 test $0x10, %ah
375 jnz L(ExitCase1_13)
376 test $0x20, %ah
377 jnz L(ExitCase1_14)
378 test $0x40, %ah
379 jnz L(ExitCase1_15)
380# ifndef USE_AS_RAWMEMCHR
381 lea 15(%edi), %eax
382 RETURN
383# else
384 lea 15(%edx), %eax
385 ret
386# endif
387
/* memchr build only: tail handling, reached when a "sub $64, %edx" left a
   result of zero or below, i.e. at most 64 bytes remain at %edi.  The 64
   is added back so %edx is the remaining count again.  Vectors are tested
   one at a time; after each miss the scan stops if the count does not
   reach into the next vector.  Hits go to L(match_case2), which checks the
   match position against the remaining count.  */
388# ifndef USE_AS_RAWMEMCHR
389 .p2align 4
390L(exit_loop):
391 add $64, %edx
392
393 movdqa (%edi), %xmm0
394 pcmpeqb %xmm1, %xmm0
/* %ecx = offset of the vector being tested, consumed by L(match_case2).  */
395 xor %ecx, %ecx
396 pmovmskb %xmm0, %eax
397 test %eax, %eax
398 jnz L(match_case2)
399 cmp $16, %edx
400 jbe L(return_null)
401
402 movdqa 16(%edi), %xmm2
403 pcmpeqb %xmm1, %xmm2
404 lea 16(%ecx), %ecx
405 pmovmskb %xmm2, %eax
406 test %eax, %eax
407 jnz L(match_case2)
408 cmp $32, %edx
409 jbe L(return_null)
410
411 movdqa 32(%edi), %xmm3
412 pcmpeqb %xmm1, %xmm3
413 lea 16(%ecx), %ecx
414 pmovmskb %xmm3, %eax
415 test %eax, %eax
416 jnz L(match_case2)
417 cmp $48, %edx
418 jbe L(return_null)
419
/* Last vector: the compare result overwrites %xmm1, which is not needed
   again on this path.  */
420 pcmpeqb 48(%edi), %xmm1
421 lea 16(%ecx), %ecx
422 pmovmskb %xmm1, %eax
423 test %eax, %eax
424 jnz L(match_case2)
425
/* No match anywhere in the tail: return 0.  */
426 xor %eax, %eax
427 RETURN
428# endif
429
/* Return stubs for the L(match_case1) cascade.  L(ExitCase1_N) returns
   scan pointer + (N - 1), i.e. the address of the byte for mask bit N - 1.
   The pointer is %edi in the memchr build and %edx in the rawmemchr
   build.  There are no stubs numbered 4, 8, 12 or 16: those cases are
   returned inline at the end of each cascade step.  */
430 .p2align 4
431L(ExitCase1_1):
432# ifndef USE_AS_RAWMEMCHR
433 mov %edi, %eax
434 RETURN
435# else
436 mov %edx, %eax
437 ret
438# endif
439
440 .p2align 4
441L(ExitCase1_2):
442# ifndef USE_AS_RAWMEMCHR
443 lea 1(%edi), %eax
444 RETURN
445# else
446 lea 1(%edx), %eax
447 ret
448# endif
449
450 .p2align 4
451L(ExitCase1_3):
452# ifndef USE_AS_RAWMEMCHR
453 lea 2(%edi), %eax
454 RETURN
455# else
456 lea 2(%edx), %eax
457 ret
458# endif
459
460 .p2align 4
461L(ExitCase1_5):
462# ifndef USE_AS_RAWMEMCHR
463 lea 4(%edi), %eax
464 RETURN
465# else
466 lea 4(%edx), %eax
467 ret
468# endif
469
470 .p2align 4
471L(ExitCase1_6):
472# ifndef USE_AS_RAWMEMCHR
473 lea 5(%edi), %eax
474 RETURN
475# else
476 lea 5(%edx), %eax
477 ret
478# endif
479
480 .p2align 4
481L(ExitCase1_7):
482# ifndef USE_AS_RAWMEMCHR
483 lea 6(%edi), %eax
484 RETURN
485# else
486 lea 6(%edx), %eax
487 ret
488# endif
489
490 .p2align 4
491L(ExitCase1_9):
492# ifndef USE_AS_RAWMEMCHR
493 lea 8(%edi), %eax
494 RETURN
495# else
496 lea 8(%edx), %eax
497 ret
498# endif
499
500 .p2align 4
501L(ExitCase1_10):
502# ifndef USE_AS_RAWMEMCHR
503 lea 9(%edi), %eax
504 RETURN
505# else
506 lea 9(%edx), %eax
507 ret
508# endif
509
510 .p2align 4
511L(ExitCase1_11):
512# ifndef USE_AS_RAWMEMCHR
513 lea 10(%edi), %eax
514 RETURN
515# else
516 lea 10(%edx), %eax
517 ret
518# endif
519
520 .p2align 4
521L(ExitCase1_13):
522# ifndef USE_AS_RAWMEMCHR
523 lea 12(%edi), %eax
524 RETURN
525# else
526 lea 12(%edx), %eax
527 ret
528# endif
529
530 .p2align 4
531L(ExitCase1_14):
532# ifndef USE_AS_RAWMEMCHR
533 lea 13(%edi), %eax
534 RETURN
535# else
536 lea 13(%edx), %eax
537 ret
538# endif
539
540 .p2align 4
541L(ExitCase1_15):
542# ifndef USE_AS_RAWMEMCHR
543 lea 14(%edi), %eax
544 RETURN
545# else
546 lea 14(%edx), %eax
547 ret
548# endif
549
/* memchr build only: match exit WITH a count check.  It uses the same
   lowest-set-bit cascade as L(match_case1), but every exit first confirms
   that the matching byte lies inside the remaining count in %edx and
   returns 0 otherwise.

   Entry points:
     match_case2          from L(exit_loop): %ecx is the offset of the
                          matching vector; the count and the pointer are
                          both rebased to that vector.
     match_case2_prolog1  from L(crosscache): only the pointer is advanced
                          by %ecx; %edx is not adjusted on this path.
     match_case2_prolog   from the unaligned probe: no adjustment.  */
550# ifndef USE_AS_RAWMEMCHR
551 .p2align 4
552L(match_case2):
553 sub %ecx, %edx
554L(match_case2_prolog1):
555 add %ecx, %edi
556L(match_case2_prolog):
/* Low mask byte zero: the first match is among bytes 8..15.  */
557 test %al, %al
558 jz L(match_case2_high)
/* Low nibble zero: the first match is among bytes 4..7.  */
559 mov %al, %cl
560 and $15, %cl
561 jz L(match_case2_8)
562 test $0x01, %al
563 jnz L(ExitCase2_1)
564 test $0x02, %al
565 jnz L(ExitCase2_2)
566 test $0x04, %al
567 jnz L(ExitCase2_3)
/* Bit 3 is the first match; it is only valid if at least 4 bytes remain.  */
568 sub $4, %edx
569 jb L(return_null)
570 lea 3(%edi), %eax
571 RETURN
572
/* First match among bytes 4..7.  */
573 .p2align 4
574L(match_case2_8):
575 test $0x10, %al
576 jnz L(ExitCase2_5)
577 test $0x20, %al
578 jnz L(ExitCase2_6)
579 test $0x40, %al
580 jnz L(ExitCase2_7)
581 sub $8, %edx
582 jb L(return_null)
583 lea 7(%edi), %eax
584 RETURN
585
/* First match among bytes 8..15: the same cascade applied to %ah.  */
586 .p2align 4
587L(match_case2_high):
588 mov %ah, %ch
589 and $15, %ch
590 jz L(match_case2_high_8)
591 test $0x01, %ah
592 jnz L(ExitCase2_9)
593 test $0x02, %ah
594 jnz L(ExitCase2_10)
595 test $0x04, %ah
596 jnz L(ExitCase2_11)
597 sub $12, %edx
598 jb L(return_null)
599 lea 11(%edi), %eax
600 RETURN
601
/* First match among bytes 12..15.  */
602 .p2align 4
603L(match_case2_high_8):
604 test $0x10, %ah
605 jnz L(ExitCase2_13)
606 test $0x20, %ah
607 jnz L(ExitCase2_14)
608 test $0x40, %ah
609 jnz L(ExitCase2_15)
610 sub $16, %edx
611 jb L(return_null)
612 lea 15(%edi), %eax
613 RETURN
614
/* Return stubs.  L(ExitCase2_N) handles a first match at index N - 1: it
   returns 0 if fewer than N bytes remain, otherwise %edi + (N - 1).
   L(ExitCase2_1) has no check; the paths leading here appear to guarantee
   at least one remaining byte (the count is tested for zero at entry and
   L(exit_loop) only advances while the count exceeds the vector offset).  */
615 .p2align 4
616L(ExitCase2_1):
617 mov %edi, %eax
618 RETURN
619
620 .p2align 4
621L(ExitCase2_2):
622 sub $2, %edx
623 jb L(return_null)
624 lea 1(%edi), %eax
625 RETURN
626
627 .p2align 4
628L(ExitCase2_3):
629 sub $3, %edx
630 jb L(return_null)
631 lea 2(%edi), %eax
632 RETURN
633
634 .p2align 4
635L(ExitCase2_5):
636 sub $5, %edx
637 jb L(return_null)
638 lea 4(%edi), %eax
639 RETURN
640
641 .p2align 4
642L(ExitCase2_6):
643 sub $6, %edx
644 jb L(return_null)
645 lea 5(%edi), %eax
646 RETURN
647
648 .p2align 4
649L(ExitCase2_7):
650 sub $7, %edx
651 jb L(return_null)
652 lea 6(%edi), %eax
653 RETURN
654
655 .p2align 4
656L(ExitCase2_9):
657 sub $9, %edx
658 jb L(return_null)
659 lea 8(%edi), %eax
660 RETURN
661
662 .p2align 4
663L(ExitCase2_10):
664 sub $10, %edx
665 jb L(return_null)
666 lea 9(%edi), %eax
667 RETURN
668
669 .p2align 4
670L(ExitCase2_11):
671 sub $11, %edx
672 jb L(return_null)
673 lea 10(%edi), %eax
674 RETURN
675
676 .p2align 4
677L(ExitCase2_13):
678 sub $13, %edx
679 jb L(return_null)
680 lea 12(%edi), %eax
681 RETURN
682
683 .p2align 4
684L(ExitCase2_14):
685 sub $14, %edx
686 jb L(return_null)
687 lea 13(%edi), %eax
688 RETURN
689
690 .p2align 4
691L(ExitCase2_15):
692 sub $15, %edx
693 jb L(return_null)
694 lea 14(%edi), %eax
695 RETURN
696# endif
697
/* Common "not found" exit: return 0 in %eax.  The memchr build goes
   through RETURN so the %edi saved by ENTRANCE is restored; the rawmemchr
   build pushed nothing and returns directly.  Within this file the label
   is only branched to from memchr-build code.  */
698 .p2align 4
699L(return_null):
700 xor %eax, %eax
701# ifndef USE_AS_RAWMEMCHR
702 RETURN
703# else
704 ret
705# endif
706
707END (MEMCHR)
708#endif
709

source code of glibc/sysdeps/i386/i686/multiarch/memchr-sse2.S