1 | /* strrchr SSE2 without bsf and bsr |
2 | Copyright (C) 2011-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | /* CFI_PUSH/CFI_POP describe a 4-byte push/pop of REG to the unwinder: they adjust the CFA offset and mark REG as saved at the current top of stack, or as restored. They emit no instructions. */ |
23 | # define CFI_PUSH(REG) \ |
24 | cfi_adjust_cfa_offset (4); \ |
25 | cfi_rel_offset (REG, 0) |
26 | |
27 | # define CFI_POP(REG) \ |
28 | cfi_adjust_cfa_offset (-4); \ |
29 | cfi_restore (REG) |
30 | /* PUSH/POP pair the real pushl/popl with the matching unwind annotation. */ |
31 | # define PUSH(REG) pushl REG; CFI_PUSH (REG) |
32 | # define POP(REG) popl REG; CFI_POP (REG) |
33 | /* PARMS is the %esp offset of the first stack argument once ENTRANCE has pushed %edi (return address + saved %edi = 8 bytes). RETURN pops %edi and returns; its trailing CFI_PUSH re-declares %edi as saved for the code that follows the ret in the file. */ |
34 | # define PARMS 8 |
35 | # define ENTRANCE PUSH(%edi); |
36 | # define RETURN POP(%edi); ret; CFI_PUSH(%edi); |
37 | /* STR1 and STR2 are the %esp offsets of the two arguments: the string pointer and the character to search for. */ |
38 | # define STR1 PARMS |
39 | # define STR2 STR1+4 |
40 | |
41 | atom_text_section |
42 | ENTRY (__strrchr_sse2) |
43 | /* Returns in %eax the address of the last byte equal to the low byte of the STR2 argument within the NUL-terminated string STR1 (the terminator itself can match), or 0 if there is none. */ |
44 | ENTRANCE |
45 | mov STR1(%esp), %ecx |
46 | movd STR2(%esp), %xmm1 |
47 | /* %xmm2 = 16 zero bytes for NUL detection; %edi = string pointer. The two punpcklbw and the pshufd below copy the low byte of %xmm1 into all 16 byte lanes. */ |
48 | pxor %xmm2, %xmm2 |
49 | mov %ecx, %edi |
50 | punpcklbw %xmm1, %xmm1 |
51 | punpcklbw %xmm1, %xmm1 |
52 | /* ECX has OFFSET of the string within its 64-byte block. */ |
53 | and $63, %ecx |
54 | cmp $48, %ecx /* a 16-byte load from an offset above 48 would run past the end of the 64-byte block */ |
55 | pshufd $0, %xmm1, %xmm1 |
56 | ja L(crosscache) |
57 | /* Offset <= 48: the first 16 bytes stay inside one 64-byte block, so read them with an unaligned load. */ |
58 | /* unaligned string. */ |
59 | movdqu (%edi), %xmm0 |
60 | pcmpeqb %xmm0, %xmm2 |
61 | pcmpeqb %xmm1, %xmm0 |
62 | /* Find where NULL is. */ |
63 | pmovmskb %xmm2, %ecx |
64 | /* Check if there is a match. */ |
65 | pmovmskb %xmm0, %eax |
66 | add $16, %edi |
67 | /* %edi now points 16 bytes past the chunk just examined. */ |
68 | test %eax, %eax |
69 | jnz L(unaligned_match1) |
70 | /* No match in the first 16 bytes: a NUL among them means the character does not occur. */ |
71 | test %ecx, %ecx |
72 | jnz L(return_null) |
73 | /* Round %edi down to a 16-byte boundary for the aligned loop; bytes already examined may be scanned again. */ |
74 | and $-16, %edi |
75 | |
76 | PUSH (%esi) |
77 | PUSH (%ebx) |
78 | /* %ebx = match mask of the last chunk that had a match (0 = none so far); %esi is the %edi value saved with it. */ |
79 | xor %ebx, %ebx |
80 | jmp L(loop) |
81 | /* Unwind annotations only (no instructions): the code that follows is entered with just %edi saved on the stack. */ |
82 | CFI_POP (%esi) |
83 | CFI_POP (%ebx) |
84 | /* The first (unaligned) 16 bytes contain a match: %eax = match mask, %ecx = NUL mask, %edi = string start + 16. */ |
85 | .p2align 4 |
86 | L(unaligned_match1): |
87 | test %ecx, %ecx /* a NUL in the same chunk: resolve the result without entering the loop */ |
88 | jnz L(prolog_find_zero_1) |
89 | /* No NUL yet: record this match (%ebx = mask, %esi = unaligned end-of-chunk pointer), then continue from the 16-byte boundary at or below %edi. */ |
90 | PUSH (%esi) |
91 | PUSH (%ebx) |
92 | |
93 | mov %eax, %ebx |
94 | mov %edi, %esi |
95 | and $-16, %edi |
96 | jmp L(loop) |
97 | /* Unwind annotations only (no instructions): the code that follows is entered with just %edi saved on the stack. */ |
98 | CFI_POP (%esi) |
99 | CFI_POP (%ebx) |
100 | /* The string starts above offset 48 of a 64-byte block: read the enclosing aligned 16-byte chunk instead and discard the bytes that precede the string. */ |
101 | .p2align 4 |
102 | L(crosscache): |
103 | /* Handle unaligned string. */ |
104 | and $15, %ecx /* %ecx = offset of the string within its 16-byte chunk */ |
105 | and $-16, %edi |
106 | pxor %xmm3, %xmm3 /* separate zero register: bytes before the string may be NUL, and %xmm2 has to stay all-zero for the loop */ |
107 | movdqa (%edi), %xmm0 |
108 | pcmpeqb %xmm0, %xmm3 |
109 | pcmpeqb %xmm1, %xmm0 |
110 | /* Find where NULL is. */ |
111 | pmovmskb %xmm3, %edx |
112 | /* Check if there is a match. */ |
113 | pmovmskb %xmm0, %eax |
114 | /* Remove the leading bytes. */ |
115 | shr %cl, %edx |
116 | shr %cl, %eax |
117 | add $16, %edi |
118 | /* Bit 0 of both masks now corresponds to the first byte of the string; %edi points at the next aligned chunk. */ |
119 | test %eax, %eax |
120 | jnz L(unaligned_match) |
121 | |
122 | test %edx, %edx |
123 | jnz L(return_null) |
124 | /* Neither a match nor a NUL: enter the aligned loop with no saved match (%ebx = 0). */ |
125 | PUSH (%esi) |
126 | PUSH (%ebx) |
127 | |
128 | xor %ebx, %ebx |
129 | jmp L(loop) |
130 | /* Unwind annotations only (no instructions): the code that follows is entered with just %edi saved on the stack. */ |
131 | CFI_POP (%esi) |
132 | CFI_POP (%ebx) |
133 | /* Match in the first (shifted) chunk of the crosscache path: %eax = match mask, %edx = NUL mask, %ecx = number of leading bytes shifted out. */ |
134 | .p2align 4 |
135 | L(unaligned_match): |
136 | test %edx, %edx |
137 | jnz L(prolog_find_zero) |
138 | |
139 | PUSH (%esi) |
140 | PUSH (%ebx) |
141 | /* No NUL yet: save the mask, and bias the saved pointer by %ecx because the mask was shifted right by that many bits. Execution then falls through into the loop. */ |
142 | mov %eax, %ebx |
143 | lea (%edi, %ecx), %esi |
144 | |
145 | /* Loop start on aligned string. */ |
146 | .p2align 4 |
147 | L(loop): |
148 | movdqa (%edi), %xmm0 |
149 | pcmpeqb %xmm0, %xmm2 /* %xmm2 is all-zero here, so this marks the NUL bytes of the chunk */ |
150 | add $16, %edi |
151 | pcmpeqb %xmm1, %xmm0 |
152 | pmovmskb %xmm2, %ecx |
153 | pmovmskb %xmm0, %eax |
154 | or %eax, %ecx /* %ecx = match mask | NUL mask; %eax keeps the match mask alone */ |
155 | jnz L(matches) |
156 | /* The loop body is unrolled four times. While no NUL is found %xmm2 remains all-zero, so it can be reused as the zero operand of the next step. */ |
157 | movdqa (%edi), %xmm0 |
158 | pcmpeqb %xmm0, %xmm2 |
159 | add $16, %edi |
160 | pcmpeqb %xmm1, %xmm0 |
161 | pmovmskb %xmm2, %ecx |
162 | pmovmskb %xmm0, %eax |
163 | or %eax, %ecx |
164 | jnz L(matches) |
165 | |
166 | movdqa (%edi), %xmm0 |
167 | pcmpeqb %xmm0, %xmm2 |
168 | add $16, %edi |
169 | pcmpeqb %xmm1, %xmm0 |
170 | pmovmskb %xmm2, %ecx |
171 | pmovmskb %xmm0, %eax |
172 | or %eax, %ecx |
173 | jnz L(matches) |
174 | |
175 | movdqa (%edi), %xmm0 |
176 | pcmpeqb %xmm0, %xmm2 |
177 | add $16, %edi |
178 | pcmpeqb %xmm1, %xmm0 |
179 | pmovmskb %xmm2, %ecx |
180 | pmovmskb %xmm0, %eax |
181 | or %eax, %ecx |
182 | jz L(loop) |
183 | /* Reached when a chunk holds a match and/or a NUL: %eax = match mask, %ecx = match mask | NUL mask, %edi = address just past the chunk. */ |
184 | L(matches): |
185 | test %eax, %eax |
186 | jnz L(match) /* otherwise the chunk has a NUL but no match: fall through and use the saved match */ |
187 | L(return_value): |
188 | test %ebx, %ebx /* %ebx == 0: no chunk has recorded a match */ |
189 | jz L(return_null_1) |
190 | mov %ebx, %eax |
191 | mov %esi, %edi |
192 | /* %eax/%edi now describe the last recorded match; drop the two saved registers and let match_exit decode the mask. */ |
193 | POP (%ebx) |
194 | POP (%esi) |
195 | |
196 | jmp L(match_exit) |
197 | /* Unwind annotations only: the code below runs with %esi and %ebx still pushed. %esi was pushed first, so it is declared first to give each register its real stack slot. */ |
198 | CFI_PUSH (%esi) |
199 | CFI_PUSH (%ebx) |
200 | |
201 | .p2align 4 |
202 | L(return_null_1): |
203 | POP (%ebx) |
204 | POP (%esi) |
205 | |
206 | xor %eax, %eax |
207 | RETURN |
208 | /* Unwind annotations only: same saved-register layout as above (%esi pushed before %ebx). */ |
209 | CFI_PUSH (%esi) |
210 | CFI_PUSH (%ebx) |
211 | /* The chunk has a match. Re-read the NUL mask (%ecx was merged with the match mask): if a NUL is present this is the final chunk, otherwise record the match and keep scanning. */ |
212 | .p2align 4 |
213 | L(match): |
214 | pmovmskb %xmm2, %ecx |
215 | test %ecx, %ecx |
216 | jnz L(find_zero) |
217 | mov %eax, %ebx /* remember the match mask of this chunk */ |
218 | mov %edi, %esi /* together with the address just past the chunk */ |
219 | jmp L(loop) |
220 | /* Final chunk with a match: %eax = match mask, %ecx = NUL mask. Find the lowest NUL bit without bsf by testing nibble by nibble, keep only the match bits at or below it, and fall back to the saved match when none remain. `1 << N - 1` is (1 << N) - 1 under gas operator precedence. */ |
221 | .p2align 4 |
222 | L(find_zero): |
223 | test %cl, %cl |
224 | jz L(find_zero_high) /* no NUL in bytes 0-7 */ |
225 | mov %cl, %dl |
226 | and $15, %dl |
227 | jz L(find_zero_8) /* no NUL in bytes 0-3 */ |
228 | test $0x01, %cl |
229 | jnz L(FindZeroExit1) |
230 | test $0x02, %cl |
231 | jnz L(FindZeroExit2) |
232 | test $0x04, %cl |
233 | jnz L(FindZeroExit3) |
234 | and $1 << 4 - 1, %eax /* NUL is byte 3: keep match bits 0-3 */ |
235 | jz L(return_value) |
236 | |
237 | POP (%ebx) |
238 | POP (%esi) |
239 | jmp L(match_exit) |
240 | /* Unwind annotations only: the code below runs with %esi and %ebx still pushed. %esi was pushed first, so it is declared first to give each register its real stack slot. */ |
241 | CFI_PUSH (%esi) |
242 | CFI_PUSH (%ebx) |
243 | |
244 | .p2align 4 |
245 | L(find_zero_8): |
246 | test $0x10, %cl |
247 | jnz L(FindZeroExit5) |
248 | test $0x20, %cl |
249 | jnz L(FindZeroExit6) |
250 | test $0x40, %cl |
251 | jnz L(FindZeroExit7) |
252 | and $1 << 8 - 1, %eax /* NUL is byte 7: keep match bits 0-7 */ |
253 | jz L(return_value) |
254 | |
255 | POP (%ebx) |
256 | POP (%esi) |
257 | jmp L(match_exit) |
258 | /* Unwind annotations only: %esi pushed before %ebx. */ |
259 | CFI_PUSH (%esi) |
260 | CFI_PUSH (%ebx) |
261 | |
262 | .p2align 4 |
263 | L(find_zero_high): |
264 | mov %ch, %dh |
265 | and $15, %dh |
266 | jz L(find_zero_high_8) /* no NUL in bytes 8-11 */ |
267 | test $0x01, %ch |
268 | jnz L(FindZeroExit9) |
269 | test $0x02, %ch |
270 | jnz L(FindZeroExit10) |
271 | test $0x04, %ch |
272 | jnz L(FindZeroExit11) |
273 | and $1 << 12 - 1, %eax /* NUL is byte 11: keep match bits 0-11 */ |
274 | jz L(return_value) |
275 | |
276 | POP (%ebx) |
277 | POP (%esi) |
278 | jmp L(match_exit) |
279 | /* Unwind annotations only: %esi pushed before %ebx. */ |
280 | CFI_PUSH (%esi) |
281 | CFI_PUSH (%ebx) |
282 | |
283 | .p2align 4 |
284 | L(find_zero_high_8): |
285 | test $0x10, %ch |
286 | jnz L(FindZeroExit13) |
287 | test $0x20, %ch |
288 | jnz L(FindZeroExit14) |
289 | test $0x40, %ch |
290 | jnz L(FindZeroExit15) |
291 | and $1 << 16 - 1, %eax /* NUL is byte 15: every match bit is kept */ |
292 | jz L(return_value) |
293 | |
294 | POP (%ebx) |
295 | POP (%esi) |
296 | jmp L(match_exit) |
297 | /* Unwind annotations only: %esi pushed before %ebx. */ |
298 | CFI_PUSH (%esi) |
299 | CFI_PUSH (%ebx) |
300 | /* FindZeroExitN: the terminator is byte N-1 of the final chunk, so only match bits 0..N-1 are kept (`1 << N - 1` is (1 << N) - 1 under gas precedence). A surviving bit is decoded by match_exit after the saved registers are popped; otherwise return_value falls back to the saved match. */ |
301 | .p2align 4 |
302 | L(FindZeroExit1): |
303 | and $1, %eax |
304 | jz L(return_value) |
305 | |
306 | POP (%ebx) |
307 | POP (%esi) |
308 | jmp L(match_exit) |
309 | /* Unwind annotations only: the code below runs with %esi and %ebx still pushed. %esi was pushed first, so it is declared first to give each register its real stack slot. The same applies to every CFI_PUSH pair below. */ |
310 | CFI_PUSH (%esi) |
311 | CFI_PUSH (%ebx) |
312 | |
313 | .p2align 4 |
314 | L(FindZeroExit2): |
315 | and $1 << 2 - 1, %eax |
316 | jz L(return_value) |
317 | |
318 | POP (%ebx) |
319 | POP (%esi) |
320 | jmp L(match_exit) |
321 | |
322 | CFI_PUSH (%esi) |
323 | CFI_PUSH (%ebx) |
324 | |
325 | .p2align 4 |
326 | L(FindZeroExit3): |
327 | and $1 << 3 - 1, %eax |
328 | jz L(return_value) |
329 | |
330 | POP (%ebx) |
331 | POP (%esi) |
332 | jmp L(match_exit) |
333 | |
334 | CFI_PUSH (%esi) |
335 | CFI_PUSH (%ebx) |
336 | |
337 | .p2align 4 |
338 | L(FindZeroExit5): |
339 | and $1 << 5 - 1, %eax |
340 | jz L(return_value) |
341 | |
342 | POP (%ebx) |
343 | POP (%esi) |
344 | jmp L(match_exit) |
345 | |
346 | CFI_PUSH (%esi) |
347 | CFI_PUSH (%ebx) |
348 | |
349 | .p2align 4 |
350 | L(FindZeroExit6): |
351 | and $1 << 6 - 1, %eax |
352 | jz L(return_value) |
353 | |
354 | POP (%ebx) |
355 | POP (%esi) |
356 | jmp L(match_exit) |
357 | |
358 | CFI_PUSH (%esi) |
359 | CFI_PUSH (%ebx) |
360 | |
361 | .p2align 4 |
362 | L(FindZeroExit7): |
363 | and $1 << 7 - 1, %eax |
364 | jz L(return_value) |
365 | |
366 | POP (%ebx) |
367 | POP (%esi) |
368 | jmp L(match_exit) |
369 | |
370 | CFI_PUSH (%esi) |
371 | CFI_PUSH (%ebx) |
372 | |
373 | .p2align 4 |
374 | L(FindZeroExit9): |
375 | and $1 << 9 - 1, %eax |
376 | jz L(return_value) |
377 | |
378 | POP (%ebx) |
379 | POP (%esi) |
380 | jmp L(match_exit) |
381 | |
382 | CFI_PUSH (%esi) |
383 | CFI_PUSH (%ebx) |
384 | |
385 | .p2align 4 |
386 | L(FindZeroExit10): |
387 | and $1 << 10 - 1, %eax |
388 | jz L(return_value) |
389 | |
390 | POP (%ebx) |
391 | POP (%esi) |
392 | jmp L(match_exit) |
393 | |
394 | CFI_PUSH (%esi) |
395 | CFI_PUSH (%ebx) |
396 | |
397 | .p2align 4 |
398 | L(FindZeroExit11): |
399 | and $1 << 11 - 1, %eax |
400 | jz L(return_value) |
401 | |
402 | POP (%ebx) |
403 | POP (%esi) |
404 | jmp L(match_exit) |
405 | |
406 | CFI_PUSH (%esi) |
407 | CFI_PUSH (%ebx) |
408 | |
409 | .p2align 4 |
410 | L(FindZeroExit13): |
411 | and $1 << 13 - 1, %eax |
412 | jz L(return_value) |
413 | |
414 | POP (%ebx) |
415 | POP (%esi) |
416 | jmp L(match_exit) |
417 | |
418 | CFI_PUSH (%esi) |
419 | CFI_PUSH (%ebx) |
420 | |
421 | .p2align 4 |
422 | L(FindZeroExit14): |
423 | and $1 << 14 - 1, %eax |
424 | jz L(return_value) |
425 | |
426 | POP (%ebx) |
427 | POP (%esi) |
428 | jmp L(match_exit) |
429 | |
430 | CFI_PUSH (%esi) |
431 | CFI_PUSH (%ebx) |
432 | |
433 | .p2align 4 |
434 | L(FindZeroExit15): |
435 | and $1 << 15 - 1, %eax |
436 | jz L(return_value) |
437 | /* The last exit pops the saved registers and falls through into match_exit instead of jumping to it. */ |
438 | POP (%ebx) |
439 | POP (%esi) |
440 | /* %ax = match mask with at least one bit set, %edi = address just past the 16-byte chunk the mask describes; only %edi is still on the stack. Return the address of the highest set bit, found without bsr by testing each nibble from its top bit downward. */ |
441 | .p2align 4 |
442 | L(match_exit): |
443 | test %ah, %ah |
444 | jnz L(match_exit_high) /* a match in bytes 8-15 */ |
445 | mov %al, %dl |
446 | and $15 << 4, %dl |
447 | jnz L(match_exit_8) /* a match in bytes 4-7 */ |
448 | test $0x08, %al |
449 | jnz L(Exit4) |
450 | test $0x04, %al |
451 | jnz L(Exit3) |
452 | test $0x02, %al |
453 | jnz L(Exit2) |
454 | lea -16(%edi), %eax /* only bit 0 is left: the match is byte 0 of the chunk */ |
455 | RETURN |
456 | |
457 | .p2align 4 |
458 | L(match_exit_8): |
459 | test $0x80, %al |
460 | jnz L(Exit8) |
461 | test $0x40, %al |
462 | jnz L(Exit7) |
463 | test $0x20, %al |
464 | jnz L(Exit6) |
465 | lea -12(%edi), %eax /* bit 4: byte 4 of the chunk */ |
466 | RETURN |
467 | |
468 | .p2align 4 |
469 | L(match_exit_high): |
470 | mov %ah, %dh |
471 | and $15 << 4, %dh |
472 | jnz L(match_exit_high_8) /* a match in bytes 12-15 */ |
473 | test $0x08, %ah |
474 | jnz L(Exit12) |
475 | test $0x04, %ah |
476 | jnz L(Exit11) |
477 | test $0x02, %ah |
478 | jnz L(Exit10) |
479 | lea -8(%edi), %eax /* bit 8: byte 8 of the chunk */ |
480 | RETURN |
481 | |
482 | .p2align 4 |
483 | L(match_exit_high_8): |
484 | test $0x80, %ah |
485 | jnz L(Exit16) |
486 | test $0x40, %ah |
487 | jnz L(Exit15) |
488 | test $0x20, %ah |
489 | jnz L(Exit14) |
490 | lea -4(%edi), %eax /* bit 12: byte 12 of the chunk */ |
491 | RETURN |
492 | /* L(ExitN) returns the address of byte N-1 of the chunk that ends at %edi, i.e. %edi - 16 + (N - 1). The exits for bytes 0, 4, 8 and 12 are inlined in the match_exit code. */ |
493 | .p2align 4 |
494 | L(Exit2): |
495 | lea -15(%edi), %eax |
496 | RETURN |
497 | |
498 | .p2align 4 |
499 | L(Exit3): |
500 | lea -14(%edi), %eax |
501 | RETURN |
502 | |
503 | .p2align 4 |
504 | L(Exit4): |
505 | lea -13(%edi), %eax |
506 | RETURN |
507 | |
508 | .p2align 4 |
509 | L(Exit6): |
510 | lea -11(%edi), %eax |
511 | RETURN |
512 | |
513 | .p2align 4 |
514 | L(Exit7): |
515 | lea -10(%edi), %eax |
516 | RETURN |
517 | |
518 | .p2align 4 |
519 | L(Exit8): |
520 | lea -9(%edi), %eax |
521 | RETURN |
522 | |
523 | .p2align 4 |
524 | L(Exit10): |
525 | lea -7(%edi), %eax |
526 | RETURN |
527 | |
528 | .p2align 4 |
529 | L(Exit11): |
530 | lea -6(%edi), %eax |
531 | RETURN |
532 | |
533 | .p2align 4 |
534 | L(Exit12): |
535 | lea -5(%edi), %eax |
536 | RETURN |
537 | |
538 | .p2align 4 |
539 | L(Exit14): |
540 | lea -3(%edi), %eax |
541 | RETURN |
542 | |
543 | .p2align 4 |
544 | L(Exit15): |
545 | lea -2(%edi), %eax |
546 | RETURN |
547 | |
548 | .p2align 4 |
549 | L(Exit16): |
550 | lea -1(%edi), %eax |
551 | RETURN |
552 | |
553 | /* Return NULL. Used by the prologue paths, which have only %edi pushed; the loop paths use return_null_1 instead. */ |
554 | .p2align 4 |
555 | L(return_null): |
556 | xor %eax, %eax |
557 | RETURN |
558 | /* Prologue counterpart of find_zero, used when the very first chunk holds both a match and a NUL. Only %edi is on the stack and no earlier match exists, so an empty masked result returns NULL directly. `1 << N - 1` is (1 << N) - 1 under gas operator precedence. */ |
559 | .p2align 4 |
560 | L(prolog_find_zero): |
561 | add %ecx, %edi /* crosscache entry: bias %edi by the number of bits shifted out of the masks */ |
562 | mov %edx, %ecx /* move the NUL mask into %ecx, where the shared code below expects it */ |
563 | L(prolog_find_zero_1): |
564 | test %cl, %cl |
565 | jz L(prolog_find_zero_high) /* no NUL in bytes 0-7 */ |
566 | mov %cl, %dl |
567 | and $15, %dl |
568 | jz L(prolog_find_zero_8) /* no NUL in bytes 0-3 */ |
569 | test $0x01, %cl |
570 | jnz L(PrologFindZeroExit1) |
571 | test $0x02, %cl |
572 | jnz L(PrologFindZeroExit2) |
573 | test $0x04, %cl |
574 | jnz L(PrologFindZeroExit3) |
575 | and $1 << 4 - 1, %eax /* NUL is byte 3: keep match bits 0-3 */ |
576 | jnz L(match_exit) |
577 | xor %eax, %eax |
578 | RETURN |
579 | |
580 | .p2align 4 |
581 | L(prolog_find_zero_8): |
582 | test $0x10, %cl |
583 | jnz L(PrologFindZeroExit5) |
584 | test $0x20, %cl |
585 | jnz L(PrologFindZeroExit6) |
586 | test $0x40, %cl |
587 | jnz L(PrologFindZeroExit7) |
588 | and $1 << 8 - 1, %eax /* NUL is byte 7: keep match bits 0-7 */ |
589 | jnz L(match_exit) |
590 | xor %eax, %eax |
591 | RETURN |
592 | |
593 | .p2align 4 |
594 | L(prolog_find_zero_high): |
595 | mov %ch, %dh |
596 | and $15, %dh |
597 | jz L(prolog_find_zero_high_8) /* no NUL in bytes 8-11 */ |
598 | test $0x01, %ch |
599 | jnz L(PrologFindZeroExit9) |
600 | test $0x02, %ch |
601 | jnz L(PrologFindZeroExit10) |
602 | test $0x04, %ch |
603 | jnz L(PrologFindZeroExit11) |
604 | and $1 << 12 - 1, %eax /* NUL is byte 11: keep match bits 0-11 */ |
605 | jnz L(match_exit) |
606 | xor %eax, %eax |
607 | RETURN |
608 | |
609 | .p2align 4 |
610 | L(prolog_find_zero_high_8): |
611 | test $0x10, %ch |
612 | jnz L(PrologFindZeroExit13) |
613 | test $0x20, %ch |
614 | jnz L(PrologFindZeroExit14) |
615 | test $0x40, %ch |
616 | jnz L(PrologFindZeroExit15) |
617 | and $1 << 16 - 1, %eax /* NUL is byte 15: every match bit is kept */ |
618 | jnz L(match_exit) |
619 | xor %eax, %eax |
620 | RETURN |
621 | /* PrologFindZeroExitN: the terminator is byte N-1 of the first chunk, so only match bits 0..N-1 are kept (`1 << N - 1` is (1 << N) - 1 under gas precedence). A surviving bit is decoded by match_exit; otherwise the result is NULL. */ |
622 | .p2align 4 |
623 | L(PrologFindZeroExit1): |
624 | and $1, %eax |
625 | jnz L(match_exit) |
626 | xor %eax, %eax |
627 | RETURN |
628 | |
629 | .p2align 4 |
630 | L(PrologFindZeroExit2): |
631 | and $1 << 2 - 1, %eax |
632 | jnz L(match_exit) |
633 | xor %eax, %eax |
634 | RETURN |
635 | |
636 | .p2align 4 |
637 | L(PrologFindZeroExit3): |
638 | and $1 << 3 - 1, %eax |
639 | jnz L(match_exit) |
640 | xor %eax, %eax |
641 | RETURN |
642 | |
643 | .p2align 4 |
644 | L(PrologFindZeroExit5): |
645 | and $1 << 5 - 1, %eax |
646 | jnz L(match_exit) |
647 | xor %eax, %eax |
648 | RETURN |
649 | |
650 | .p2align 4 |
651 | L(PrologFindZeroExit6): |
652 | and $1 << 6 - 1, %eax |
653 | jnz L(match_exit) |
654 | xor %eax, %eax |
655 | RETURN |
656 | |
657 | .p2align 4 |
658 | L(PrologFindZeroExit7): |
659 | and $1 << 7 - 1, %eax |
660 | jnz L(match_exit) |
661 | xor %eax, %eax |
662 | RETURN |
663 | |
664 | .p2align 4 |
665 | L(PrologFindZeroExit9): |
666 | and $1 << 9 - 1, %eax |
667 | jnz L(match_exit) |
668 | xor %eax, %eax |
669 | RETURN |
670 | |
671 | .p2align 4 |
672 | L(PrologFindZeroExit10): |
673 | and $1 << 10 - 1, %eax |
674 | jnz L(match_exit) |
675 | xor %eax, %eax |
676 | RETURN |
677 | |
678 | .p2align 4 |
679 | L(PrologFindZeroExit11): |
680 | and $1 << 11 - 1, %eax |
681 | jnz L(match_exit) |
682 | xor %eax, %eax |
683 | RETURN |
684 | |
685 | .p2align 4 |
686 | L(PrologFindZeroExit13): |
687 | and $1 << 13 - 1, %eax |
688 | jnz L(match_exit) |
689 | xor %eax, %eax |
690 | RETURN |
691 | |
692 | .p2align 4 |
693 | L(PrologFindZeroExit14): |
694 | and $1 << 14 - 1, %eax |
695 | jnz L(match_exit) |
696 | xor %eax, %eax |
697 | RETURN |
698 | |
699 | .p2align 4 |
700 | L(PrologFindZeroExit15): |
701 | and $1 << 15 - 1, %eax |
702 | jnz L(match_exit) |
703 | xor %eax, %eax |
704 | RETURN |
705 | |
706 | END (__strrchr_sse2) |
707 | #endif |
708 | |