/* strrchr SSE2 without bsf and bsr
   Copyright (C) 2011-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#if IS_IN (libc)

# include <sysdep.h>

/* Unwind bookkeeping that accompanies a 4-byte push of REG: the CFA
   offset grows by 4 and REG is recorded as saved at offset 0 from the
   stack pointer.  cfi_adjust_cfa_offset and cfi_rel_offset are not
   defined in this file; presumably they come from <sysdep.h>.  */
# define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

/* Unwind bookkeeping that accompanies a 4-byte pop of REG: the CFA
   offset shrinks by 4 and REG is marked as restored.  */
# define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

/* Push/pop a 32-bit register together with its unwind annotation.  */
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

/* Offset from %esp of the first argument once ENTRANCE has run:
   4 bytes of return address plus the 4 bytes of %edi it pushes.  */
# define PARMS 8
/* Function prologue: save %edi.  */
# define ENTRANCE PUSH(%edi);
/* Function epilogue: restore %edi and return.  The CFI_PUSH after the
   ret puts the unwind state back to "%edi is pushed" for the code that
   follows the ret in the file, which is reached with %edi still saved.  */
# define RETURN POP(%edi); ret; CFI_PUSH(%edi);

/* Stack offsets of the two arguments (read below with mov and movd).  */
# define STR1 PARMS
# define STR2 STR1+4

	atom_text_section
/* __strrchr_sse2: find the last occurrence of a byte in a NUL-terminated
   string.  The string is examined 16 bytes at a time with SSE2 compares;
   the resulting bit masks are decoded with test/branch ladders rather
   than with bsf/bsr.

   In:   STR1(%esp) = string pointer
	 STR2(%esp) = value to search for; only its low byte is used
   Out:  %eax = address of the last matching byte located at or before
	 the first NUL, or 0 when no such byte exists.

   Register roles:
     %edi   scan pointer.  It is advanced by 16 right after a chunk is
	    loaded, so while masks are being decoded it points 16 bytes
	    past the chunk they describe.
     %xmm1  the searched byte replicated into all 16 lanes.
     %xmm2  zero before each compare; holds the NUL compare afterwards.
     %eax   match mask of the current chunk (bit i set = byte i matches).
     %ecx   NUL mask of the current chunk.
     %ebx   match mask of the latest chunk that contained a match, or 0.
     %esi   the %edi value that goes with %ebx.
     %edx   scratch (%dl/%dh) in the decode ladders.
   %edi is saved by ENTRANCE and restored by RETURN.  %esi and %ebx are
   pushed only on the paths that enter L(loop) and popped before
   L(match_exit) or the NULL return.

   Immediates written `1 << N - 1' rely on gas giving `<<' a higher
   precedence than `-': they evaluate to (1 << N) - 1, a mask of the N
   lowest bits.

   The CFI_POP/CFI_PUSH lines that follow an unconditional jmp or a
   RETURN emit no instructions; they only reset the unwind state for the
   code that comes next in the file.  */
ENTRY (__strrchr_sse2)

	ENTRANCE
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1

	pxor	%xmm2, %xmm2
	mov	%ecx, %edi
	/* The two byte interleaves plus the pshufd below copy the low
	   byte of %xmm1 into all 16 byte lanes.  */
	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1
	/* ECX has OFFSET. */
	and	$63, %ecx
	cmp	$48, %ecx
	pshufd	$0, %xmm1, %xmm1
	/* Offset inside the 64-byte block is above 48: a 16-byte load
	   from the string start would reach into the next 64-byte block,
	   so use the aligned-load path instead.  */
	ja	L(crosscache)

/* unaligned string. */
	movdqu	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	pcmpeqb	%xmm1, %xmm0
	/* Find where NULL is. */
	pmovmskb %xmm2, %ecx
	/* Check if there is a match. */
	pmovmskb %xmm0, %eax
	add	$16, %edi

	test	%eax, %eax
	jnz	L(unaligned_match1)

	test	%ecx, %ecx
	jnz	L(return_null)

	/* No match and no NUL in the first 16 bytes.  Round the pointer
	   down to a 16-byte boundary; the loop may look again at bytes
	   already known to hold neither.  */
	and	$-16, %edi

	PUSH (%esi)
	PUSH (%ebx)

	/* No match remembered yet.  */
	xor	%ebx, %ebx
	jmp	L(loop)

	CFI_POP (%esi)
	CFI_POP (%ebx)

	.p2align 4
/* The first (unaligned) 16 bytes contain a match.  %eax = match mask,
   %ecx = NUL mask, %edi = string start + 16.  */
L(unaligned_match1):
	test	%ecx, %ecx
	jnz	L(prolog_find_zero_1)

	PUSH (%esi)
	PUSH (%ebx)

	/* Remember this chunk, then continue from the aligned address.  */
	mov	%eax, %ebx
	mov	%edi, %esi
	and	$-16, %edi
	jmp	L(loop)

	CFI_POP (%esi)
	CFI_POP (%ebx)

	.p2align 4
L(crosscache):
/* Handle unaligned string. */
	/* %ecx = offset of the string start inside its 16-byte chunk.  */
	and	$15, %ecx
	and	$-16, %edi
	pxor	%xmm3, %xmm3
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm3
	pcmpeqb	%xmm1, %xmm0
	/* Find where NULL is. */
	pmovmskb %xmm3, %edx
	/* Check if there is a match. */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes. */
	shr	%cl, %edx
	shr	%cl, %eax
	add	$16, %edi

	test	%eax, %eax
	jnz	L(unaligned_match)

	test	%edx, %edx
	jnz	L(return_null)

	PUSH (%esi)
	PUSH (%ebx)

	/* No match remembered yet.  */
	xor	%ebx, %ebx
	jmp	L(loop)

	CFI_POP (%esi)
	CFI_POP (%ebx)

	.p2align 4
/* The first aligned chunk contains a match at or after the string
   start.  %eax = shifted match mask, %edx = shifted NUL mask,
   %ecx = start offset, %edi = chunk address + 16.  */
L(unaligned_match):
	test	%edx, %edx
	jnz	L(prolog_find_zero)

	PUSH (%esi)
	PUSH (%ebx)

	mov	%eax, %ebx
	/* The mask was shifted right by %ecx bits, so the saved pointer
	   is moved forward by the same amount to compensate.  */
	lea	(%edi, %ecx), %esi

/* Loop start on aligned string. */
	.p2align 4
/* Unrolled four times; each copy examines one aligned 16-byte chunk and
   leaves as soon as the chunk holds a match or a NUL.  %xmm2 is zero
   before every pcmpeqb because a nonzero NUL mask always exits.  */
L(loop):
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jz	L(loop)

/* The current chunk has a match, a NUL, or both.  */
L(matches):
	test	%eax, %eax
	jnz	L(match)
/* No usable match in the current chunk and the string ends here: fall
   back to the remembered chunk, if there is one.  */
L(return_value):
	test	%ebx, %ebx
	jz	L(return_null_1)
	mov	%ebx, %eax
	mov	%esi, %edi

	POP (%ebx)
	POP (%esi)

	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
/* NULL return for paths on which %esi and %ebx are on the stack.  */
L(return_null_1):
	POP (%ebx)
	POP (%esi)

	xor	%eax, %eax
	RETURN

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
/* The current chunk has a match.  %ecx was merged with %eax in the
   loop, so the NUL mask is extracted again from %xmm2.  */
L(match):
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(find_zero)
	/* No NUL yet: remember this chunk and keep scanning.  */
	mov	%eax, %ebx
	mov	%edi, %esi
	jmp	L(loop)

	.p2align 4
/* The current chunk has both a match and a NUL.  Locate the lowest set
   bit of the NUL mask in %ecx and keep only the match bits at or below
   it.  If none survive, L(return_value) falls back to the remembered
   chunk.  The ladder narrows to byte, then nibble, then bit; the case
   where the NUL is the top bit of a nibble is handled inline, the other
   positions by L(FindZeroExitN), N being the 1-based NUL position.  */
L(find_zero):
	test	%cl, %cl
	jz	L(find_zero_high)
	mov	%cl, %dl
	and	$15, %dl
	jz	L(find_zero_8)
	test	$0x01, %cl
	jnz	L(FindZeroExit1)
	test	$0x02, %cl
	jnz	L(FindZeroExit2)
	test	$0x04, %cl
	jnz	L(FindZeroExit3)
	and	$1 << 4 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
/* First NUL is in bytes 4..7.  */
L(find_zero_8):
	test	$0x10, %cl
	jnz	L(FindZeroExit5)
	test	$0x20, %cl
	jnz	L(FindZeroExit6)
	test	$0x40, %cl
	jnz	L(FindZeroExit7)
	and	$1 << 8 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
/* First NUL is in bytes 8..15.  */
L(find_zero_high):
	mov	%ch, %dh
	and	$15, %dh
	jz	L(find_zero_high_8)
	test	$0x01, %ch
	jnz	L(FindZeroExit9)
	test	$0x02, %ch
	jnz	L(FindZeroExit10)
	test	$0x04, %ch
	jnz	L(FindZeroExit11)
	and	$1 << 12 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
/* First NUL is in bytes 12..15.  */
L(find_zero_high_8):
	test	$0x10, %ch
	jnz	L(FindZeroExit13)
	test	$0x20, %ch
	jnz	L(FindZeroExit14)
	test	$0x40, %ch
	jnz	L(FindZeroExit15)
	and	$1 << 16 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
/* L(FindZeroExitN): the first NUL is byte N-1 of the chunk.  Keep the N
   lowest match bits; if any remain, decode them in L(match_exit),
   otherwise fall back to the remembered chunk.  */
L(FindZeroExit1):
	and	$1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
L(FindZeroExit2):
	and	$1 << 2 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
L(FindZeroExit3):
	and	$1 << 3 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
L(FindZeroExit5):
	and	$1 << 5 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
L(FindZeroExit6):
	and	$1 << 6 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
L(FindZeroExit7):
	and	$1 << 7 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
L(FindZeroExit9):
	and	$1 << 9 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
L(FindZeroExit10):
	and	$1 << 10 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
L(FindZeroExit11):
	and	$1 << 11 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
L(FindZeroExit13):
	and	$1 << 13 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
L(FindZeroExit14):
	and	$1 << 14 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	jmp	L(match_exit)

	CFI_PUSH (%ebx)
	CFI_PUSH (%esi)

	.p2align 4
L(FindZeroExit15):
	and	$1 << 15 - 1, %eax
	jz	L(return_value)

	POP (%ebx)
	POP (%esi)
	/* Falls through into L(match_exit).  */

	.p2align 4
/* %eax holds a nonzero match mask and %edi points 16 bytes past the
   first byte that mask describes; only %edi is still on the stack.
   Find the highest set bit (the last match) with the same byte, nibble,
   bit narrowing and return its address.  Bit i maps to -16+i(%edi).
   The lowest bit of each nibble is handled inline; L(ExitN) handles a
   highest match at 1-based position N.  */
L(match_exit):
	test	%ah, %ah
	jnz	L(match_exit_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(match_exit_8)
	test	$0x08, %al
	jnz	L(Exit4)
	test	$0x04, %al
	jnz	L(Exit3)
	test	$0x02, %al
	jnz	L(Exit2)
	lea	-16(%edi), %eax
	RETURN

	.p2align 4
/* Highest match is in bytes 4..7.  */
L(match_exit_8):
	test	$0x80, %al
	jnz	L(Exit8)
	test	$0x40, %al
	jnz	L(Exit7)
	test	$0x20, %al
	jnz	L(Exit6)
	lea	-12(%edi), %eax
	RETURN

	.p2align 4
/* Highest match is in bytes 8..15.  */
L(match_exit_high):
	mov	%ah, %dh
	and	$15 << 4, %dh
	jnz	L(match_exit_high_8)
	test	$0x08, %ah
	jnz	L(Exit12)
	test	$0x04, %ah
	jnz	L(Exit11)
	test	$0x02, %ah
	jnz	L(Exit10)
	lea	-8(%edi), %eax
	RETURN

	.p2align 4
/* Highest match is in bytes 12..15.  */
L(match_exit_high_8):
	test	$0x80, %ah
	jnz	L(Exit16)
	test	$0x40, %ah
	jnz	L(Exit15)
	test	$0x20, %ah
	jnz	L(Exit14)
	lea	-4(%edi), %eax
	RETURN

	.p2align 4
L(Exit2):
	lea	-15(%edi), %eax
	RETURN

	.p2align 4
L(Exit3):
	lea	-14(%edi), %eax
	RETURN

	.p2align 4
L(Exit4):
	lea	-13(%edi), %eax
	RETURN

	.p2align 4
L(Exit6):
	lea	-11(%edi), %eax
	RETURN

	.p2align 4
L(Exit7):
	lea	-10(%edi), %eax
	RETURN

	.p2align 4
L(Exit8):
	lea	-9(%edi), %eax
	RETURN

	.p2align 4
L(Exit10):
	lea	-7(%edi), %eax
	RETURN

	.p2align 4
L(Exit11):
	lea	-6(%edi), %eax
	RETURN

	.p2align 4
L(Exit12):
	lea	-5(%edi), %eax
	RETURN

	.p2align 4
L(Exit14):
	lea	-3(%edi), %eax
	RETURN

	.p2align 4
L(Exit15):
	lea	-2(%edi), %eax
	RETURN

	.p2align 4
L(Exit16):
	lea	-1(%edi), %eax
	RETURN

/* Return NULL. */
	.p2align 4
/* Used on paths where only %edi has been pushed.  */
L(return_null):
	xor	%eax, %eax
	RETURN

	.p2align 4
/* Match and NUL both occur in the very first chunk, before %esi and
   %ebx were pushed and before any match was remembered.  The decode is
   the same as L(find_zero), except that an empty masked result means
   NULL.  Entry from the aligned-load path: move %edi forward by the
   start offset to compensate for the shifted masks, and move the NUL
   mask from %edx into %ecx.  */
L(prolog_find_zero):
	add	%ecx, %edi
	mov	%edx, %ecx
/* Entry from the unaligned-load path: %ecx already holds the NUL mask. */
L(prolog_find_zero_1):
	test	%cl, %cl
	jz	L(prolog_find_zero_high)
	mov	%cl, %dl
	and	$15, %dl
	jz	L(prolog_find_zero_8)
	test	$0x01, %cl
	jnz	L(PrologFindZeroExit1)
	test	$0x02, %cl
	jnz	L(PrologFindZeroExit2)
	test	$0x04, %cl
	jnz	L(PrologFindZeroExit3)
	and	$1 << 4 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
/* First NUL is in bytes 4..7.  */
L(prolog_find_zero_8):
	test	$0x10, %cl
	jnz	L(PrologFindZeroExit5)
	test	$0x20, %cl
	jnz	L(PrologFindZeroExit6)
	test	$0x40, %cl
	jnz	L(PrologFindZeroExit7)
	and	$1 << 8 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
/* First NUL is in bytes 8..15.  */
L(prolog_find_zero_high):
	mov	%ch, %dh
	and	$15, %dh
	jz	L(prolog_find_zero_high_8)
	test	$0x01, %ch
	jnz	L(PrologFindZeroExit9)
	test	$0x02, %ch
	jnz	L(PrologFindZeroExit10)
	test	$0x04, %ch
	jnz	L(PrologFindZeroExit11)
	and	$1 << 12 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
/* First NUL is in bytes 12..15.  */
L(prolog_find_zero_high_8):
	test	$0x10, %ch
	jnz	L(PrologFindZeroExit13)
	test	$0x20, %ch
	jnz	L(PrologFindZeroExit14)
	test	$0x40, %ch
	jnz	L(PrologFindZeroExit15)
	and	$1 << 16 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
/* L(PrologFindZeroExitN): the first NUL is byte N-1.  Keep the N lowest
   match bits; decode them if any remain, otherwise return NULL.  */
L(PrologFindZeroExit1):
	and	$1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit2):
	and	$1 << 2 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit3):
	and	$1 << 3 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit5):
	and	$1 << 5 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit6):
	and	$1 << 6 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit7):
	and	$1 << 7 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit9):
	and	$1 << 9 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit10):
	and	$1 << 10 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit11):
	and	$1 << 11 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit13):
	and	$1 << 13 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit14):
	and	$1 << 14 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit15):
	and	$1 << 15 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

END (__strrchr_sse2)
#endif

/* Source: glibc/sysdeps/i386/i686/multiarch/strrchr-sse2.S */