/* Optimized memrchr with sse2 without bsf
   Copyright (C) 2011-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

/* Unwind bookkeeping for a 4-byte push of REG: the CFA moves by 4 and
   REG is recorded as saved at the new top of stack.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Unwind bookkeeping for the matching 4-byte pop of REG.  */
# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push/pop a 32-bit register and keep the CFI state in step.  */
# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* Stack offsets of the three arguments relative to %esp on entry
   (0(%esp) holds the return address).  */
# define PARMS	4
# define STR1	PARMS
# define STR2	STR1+4
# define LEN	STR2+4

/* __memrchr_sse2 (s, c, n): search the N bytes starting at S backwards
   for the byte C.

   In:	STR1(%esp) = s, STR2(%esp) = c (only the low byte is used),
	LEN(%esp) = n.
   Out:	%eax = address of the last matching byte, or 0 if there is none.
   Uses:	%eax, %ecx, %edx, %xmm0-%xmm4 and the flags.  %edi is used only
	on the short-buffer path, where it is pushed and popped.

   Register roles on the long (n > 16) path:
	%ecx	base address of the 16- or 64-byte window being examined.
	%edx	running count of bytes that are still inside the buffer.
	%xmm1	the search byte replicated into all 16 lanes.
	%eax	pmovmskb result: bit i is set when byte i of the chunk
		matched.

   The highest set bit of a mask is located with chains of test/jnz
   instead of a bit-scan instruction.  L(exit_dispatch) and friends do
   this for chunks known to lie fully inside the buffer; the *_1
   variants additionally reject a match that falls below S.  */

	atom_text_section
ENTRY (__memrchr_sse2)
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1
	mov	LEN(%esp), %edx

	/* n <= 16 (including n == 0) is handled separately.  */
	sub	$16, %edx
	jbe	L(length_less16)

	/* Broadcast the search byte: two punpcklbw steps replicate it
	   across the low dword, pshufd $0 copies that dword to all four.
	   In between, point %ecx at the last 16 bytes of the buffer and
	   test them with an unaligned load.  */
	punpcklbw %xmm1, %xmm1
	add	%edx, %ecx
	punpcklbw %xmm1, %xmm1

	movdqu	(%ecx), %xmm0
	pshufd	$0, %xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(exit_dispatch)

	/* Move to the 64-byte window below the chunk just tested.  If it is
	   not 16-byte aligned, round %ecx up to the next 16-byte boundary
	   and grow %edx by the same amount (16 - misalignment); the window
	   then re-covers part of the chunk already tested.  */
	sub	$64, %ecx
	mov	%ecx, %eax
	and	$15, %eax
	jz	L(loop_prolog)

	lea	16(%ecx), %ecx
	lea	16(%edx), %edx
	sub	%eax, %edx
	and	$-16, %ecx

	.p2align 4
/* Loop start on aligned string.  */
L(loop_prolog):
	/* Two unrolled 64-byte windows, each scanned from its highest
	   16-byte chunk down to its lowest.  A window that is not known to
	   lie completely inside the buffer is finished by L(exit_loop).  */
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm4
	pcmpeqb	%xmm1, %xmm4
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(exit_dispatch)

	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(exit_dispatch)

	/* Align %ecx to 64 bytes for the main loop, using the same
	   round-up-and-compensate scheme as for the 16-byte alignment
	   above.  */
	mov	%ecx, %eax
	and	$63, %eax
	test	%eax, %eax
	jz	L(align64_loop)

	lea	64(%ecx), %ecx
	lea	64(%edx), %edx
	and	$-64, %ecx
	sub	%eax, %edx

	.p2align 4
L(align64_loop):
	/* Main loop: one 64-byte window per iteration.  The four compare
	   results (each byte 0x00 or 0xff) are merged with pmaxub so that a
	   single pmovmskb/test decides whether the window has any match.  */
	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	(%ecx), %xmm0
	movdqa	16(%ecx), %xmm2
	movdqa	32(%ecx), %xmm3
	movdqa	48(%ecx), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	pmaxub	%xmm3, %xmm0
	pmaxub	%xmm4, %xmm2
	pmaxub	%xmm0, %xmm2
	pmovmskb %xmm2, %eax

	test	%eax, %eax
	jz	L(align64_loop)

	/* Some chunk matched; probe them from the highest address down.  */
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches48)

	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* %xmm2 and %xmm0 were overwritten by the merge, so the two low
	   chunks are compared again.  The last compare writes into %xmm1;
	   the broadcast byte is no longer needed because every path from
	   here returns.  */
	movdqa	16(%ecx), %xmm2

	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	(%ecx), %xmm1

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* Only the lowest chunk is left, so its mask must be non-zero.
	   What follows is an inline copy of L(exit_dispatch).  */
	pmovmskb %xmm1, %eax
	test	%ah, %ah
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax
	ret

	.p2align 4
L(exit_loop):
	/* Final, possibly partial, window.  Undo the last subtraction so
	   that %edx counts the bytes at the top of the window at %ecx that
	   belong to the buffer.  More than 32: chunks 48 and 32 are fully
	   inside; chunk 16 and chunk 0 need a bounds check on the match.  */
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches0_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	/* At most 32 bytes of the window are inside the buffer: only
	   chunks 48 and (if more than 16 bytes remain) 32 are examined,
	   both with a bounds check on the match.  */
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	32(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	xor	%eax, %eax
	ret

/* Unchecked exits.  L(matchesNN) first advances %ecx to the chunk that
   produced the mask in %eax; the dispatch then returns %ecx plus the
   index of the highest set mask bit.  %edx is free to be used as
   scratch here.  */

	.p2align 4
L(matches16):
	lea	16(%ecx), %ecx
	test	%ah, %ah
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches32):
	lea	32(%ecx), %ecx
	test	%ah, %ah
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches48):
	lea	48(%ecx), %ecx

	.p2align 4
L(exit_dispatch):
	/* Bits 15..8 set?  Otherwise bits 7..4?  Otherwise test bits 3, 2
	   and 1; if none is set, bit 0 is the match.  */
	test	%ah, %ah
	jnz	L(exit_dispatch_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(exit_dispatch_8)
	test	$0x08, %al
	jnz	L(exit_4)
	test	$0x04, %al
	jnz	L(exit_3)
	test	$0x02, %al
	jnz	L(exit_2)
	mov	%ecx, %eax
	ret

	.p2align 4
L(exit_dispatch_8):
	/* Highest set bit is one of bits 7..4.  */
	test	$0x80, %al
	jnz	L(exit_8)
	test	$0x40, %al
	jnz	L(exit_7)
	test	$0x20, %al
	jnz	L(exit_6)
	lea	4(%ecx), %eax
	ret

	.p2align 4
L(exit_dispatch_high):
	/* Highest set bit is in %ah (bytes 8..15 of the chunk).  */
	mov	%ah, %dh
	and	$15 << 4, %dh
	jnz	L(exit_dispatch_high_8)
	test	$0x08, %ah
	jnz	L(exit_12)
	test	$0x04, %ah
	jnz	L(exit_11)
	test	$0x02, %ah
	jnz	L(exit_10)
	lea	8(%ecx), %eax
	ret

	.p2align 4
L(exit_dispatch_high_8):
	/* Highest set bit is one of bits 15..12.  */
	test	$0x80, %ah
	jnz	L(exit_16)
	test	$0x40, %ah
	jnz	L(exit_15)
	test	$0x20, %ah
	jnz	L(exit_14)
	lea	12(%ecx), %eax
	ret

/* L(exit_N) returns the address of the N-th byte (1-based) of the chunk
   at %ecx, i.e. %ecx + N - 1.  */

	.p2align 4
L(exit_2):
	lea	1(%ecx), %eax
	ret

	.p2align 4
L(exit_3):
	lea	2(%ecx), %eax
	ret

	.p2align 4
L(exit_4):
	lea	3(%ecx), %eax
	ret

	.p2align 4
L(exit_6):
	lea	5(%ecx), %eax
	ret

	.p2align 4
L(exit_7):
	lea	6(%ecx), %eax
	ret

	.p2align 4
L(exit_8):
	lea	7(%ecx), %eax
	ret

	.p2align 4
L(exit_10):
	lea	9(%ecx), %eax
	ret

	.p2align 4
L(exit_11):
	lea	10(%ecx), %eax
	ret

	.p2align 4
L(exit_12):
	lea	11(%ecx), %eax
	ret

	.p2align 4
L(exit_14):
	lea	13(%ecx), %eax
	ret

	.p2align 4
L(exit_15):
	lea	14(%ecx), %eax
	ret

	.p2align 4
L(exit_16):
	lea	15(%ecx), %eax
	ret

/* Bounds-checked exits, reached only from L(exit_loop) and
   L(exit_loop_32).  L(matchesNN_1) advances %ecx to the matching chunk
   and subtracts (64 - NN) from %edx.  After that, %edx plus the byte
   index of the match inside the chunk is negative exactly when the
   match lies below the start of the buffer; every exit performs that
   addition and returns 0 through L(return_null) on a negative result.
   %ah/%al are reused as scratch for the nibble tests because %edx is
   live.  */

	.p2align 4
L(matches0_1):
	lea	-64(%edx), %edx

	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)
	add	$0, %edx
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches16_1):
	lea	-48(%edx), %edx
	lea	16(%ecx), %ecx

	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)
	add	$0, %edx
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches32_1):
	lea	-32(%edx), %edx
	lea	32(%ecx), %ecx

	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)
	add	$0, %edx
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
L(matches48_1):
	lea	-16(%edx), %edx
	lea	48(%ecx), %ecx

	.p2align 4
L(exit_dispatch_1):
	test	%ah, %ah
	jnz	L(exit_dispatch_1_high)
	mov	%al, %ah
	and	$15 << 4, %ah
	jnz	L(exit_dispatch_1_8)
	test	$0x08, %al
	jnz	L(exit_1_4)
	test	$0x04, %al
	jnz	L(exit_1_3)
	test	$0x02, %al
	jnz	L(exit_1_2)
	add	$0, %edx
	jl	L(return_null)
	mov	%ecx, %eax
	ret

	.p2align 4
L(exit_dispatch_1_8):
	test	$0x80, %al
	jnz	L(exit_1_8)
	test	$0x40, %al
	jnz	L(exit_1_7)
	test	$0x20, %al
	jnz	L(exit_1_6)
	add	$4, %edx
	jl	L(return_null)
	lea	4(%ecx), %eax
	ret

	.p2align 4
L(exit_dispatch_1_high):
	mov	%ah, %al
	and	$15 << 4, %al
	jnz	L(exit_dispatch_1_high_8)
	test	$0x08, %ah
	jnz	L(exit_1_12)
	test	$0x04, %ah
	jnz	L(exit_1_11)
	test	$0x02, %ah
	jnz	L(exit_1_10)
	add	$8, %edx
	jl	L(return_null)
	lea	8(%ecx), %eax
	ret

	.p2align 4
L(exit_dispatch_1_high_8):
	test	$0x80, %ah
	jnz	L(exit_1_16)
	test	$0x40, %ah
	jnz	L(exit_1_15)
	test	$0x20, %ah
	jnz	L(exit_1_14)
	add	$12, %edx
	jl	L(return_null)
	lea	12(%ecx), %eax
	ret

	.p2align 4
L(exit_1_2):
	add	$1, %edx
	jl	L(return_null)
	lea	1(%ecx), %eax
	ret

	.p2align 4
L(exit_1_3):
	add	$2, %edx
	jl	L(return_null)
	lea	2(%ecx), %eax
	ret

	.p2align 4
L(exit_1_4):
	add	$3, %edx
	jl	L(return_null)
	lea	3(%ecx), %eax
	ret

	.p2align 4
L(exit_1_6):
	add	$5, %edx
	jl	L(return_null)
	lea	5(%ecx), %eax
	ret

	.p2align 4
L(exit_1_7):
	add	$6, %edx
	jl	L(return_null)
	lea	6(%ecx), %eax
	ret

	.p2align 4
L(exit_1_8):
	add	$7, %edx
	jl	L(return_null)
	lea	7(%ecx), %eax
	ret

	.p2align 4
L(exit_1_10):
	add	$9, %edx
	jl	L(return_null)
	lea	9(%ecx), %eax
	ret

	.p2align 4
L(exit_1_11):
	add	$10, %edx
	jl	L(return_null)
	lea	10(%ecx), %eax
	ret

	.p2align 4
L(exit_1_12):
	add	$11, %edx
	jl	L(return_null)
	lea	11(%ecx), %eax
	ret

	.p2align 4
L(exit_1_14):
	add	$13, %edx
	jl	L(return_null)
	lea	13(%ecx), %eax
	ret

	.p2align 4
L(exit_1_15):
	add	$14, %edx
	jl	L(return_null)
	lea	14(%ecx), %eax
	ret

	.p2align 4
L(exit_1_16):
	add	$15, %edx
	jl	L(return_null)
	lea	15(%ecx), %eax
	ret

	.p2align 4
L(return_null):
	/* No match; %edi has not been pushed on the paths that get here.  */
	xor	%eax, %eax
	ret

	.p2align 4
L(length_less16_offset0):
	/* Short buffer starting on a 16-byte boundary.  On entry %eax = s,
	   %edx = n (1..16) and %xmm1 holds the broadcast byte.  Keep only
	   the low n mask bits and reuse the unchecked dispatch with
	   %ecx = s.  */
	mov	%dl, %cl
	pcmpeqb	(%eax), %xmm1

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	mov	%eax, %ecx
	pmovmskb %xmm1, %eax

	and	%edx, %eax
	test	%eax, %eax
	jnz	L(exit_dispatch)

	xor	%eax, %eax
	ret

	.p2align 4
L(length_less16):
	/* n <= 16.  Restore n in %edx (n == 0 returns 0) while finishing
	   the byte broadcast.  */
	punpcklbw %xmm1, %xmm1
	add	$16, %edx
	je	L(return_null)
	punpcklbw %xmm1, %xmm1

	mov	%ecx, %eax
	pshufd	$0, %xmm1, %xmm1

	/* %ecx = misalignment of s within its 16-byte chunk.  */
	and	$15, %ecx
	jz	L(length_less16_offset0)

	PUSH	(%edi)

	/* %dh = misalignment + n - 16: positive when the buffer spills into
	   the next aligned chunk.  %eax becomes the aligned chunk base.  */
	mov	%cl, %dh
	add	%dl, %dh
	and	$-16, %eax

	sub	$16, %dh
	ja	L(length_less16_part2)

	/* Whole buffer is inside one aligned chunk: shift the mask so that
	   bit 0 corresponds to s, keep the low n bits and let bsr pick the
	   last match.  */
	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	sar	%cl, %edi
	add	%ecx, %eax
	mov	%dl, %cl

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	and	%edx, %edi
	test	%edi, %edi
	jz	L(ret_null)

	bsr	%edi, %edi
	add	%edi, %eax
	POP	(%edi)
	ret

	/* The POP above only described the epilogue that precedes it in the
	   file; the code below runs with %edi still pushed.  */
	CFI_PUSH     (%edi)

	.p2align 4
L(length_less16_part2):
	/* Buffer straddles two aligned chunks.  Test the upper one first,
	   limited to its low %dh bytes (the part inside the buffer).  */
	movdqa	16(%eax), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %edi

	/* Park the misalignment in %ch while %cl is the shift count.  */
	mov	%cl, %ch

	mov	%dh, %cl
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	and	%edx, %edi

	test	%edi, %edi
	jnz	L(length_less16_part2_return)

	/* Lower chunk: discard the mask bits for bytes below s.  */
	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	mov	%ch, %cl
	sar	%cl, %edi
	test	%edi, %edi
	jz	L(ret_null)

	/* Result = aligned base + misalignment + bit index; clearing %ch
	   leaves just the misalignment in %ecx.  */
	bsr	%edi, %edi
	add	%edi, %eax
	xor	%ch, %ch
	add	%ecx, %eax
	POP	(%edi)
	ret

	CFI_PUSH     (%edi)

	.p2align 4
L(length_less16_part2_return):
	/* Match in the upper chunk: base + 16 + index of highest set bit.  */
	bsr	%edi, %edi
	lea	16(%eax, %edi), %eax
	POP	(%edi)
	ret

	CFI_PUSH     (%edi)

	.p2align 4
L(ret_null):
	/* No match on the short path; %edi must be restored first.  */
	xor	%eax, %eax
	POP	(%edi)
	ret

END (__memrchr_sse2)
#endif
/* Source: glibc/sysdeps/i386/i686/multiarch/memrchr-sse2.S  */