1 | /* Optimized memrchr with sse2 without bsf |
2 | Copyright (C) 2011-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | # define CFI_PUSH(REG) \ |
23 | cfi_adjust_cfa_offset (4); \ |
24 | cfi_rel_offset (REG, 0) /* CFI_PUSH: unwind annotations that accompany a 4-byte push of REG (the cfi_* helpers are not defined in this file; presumably they come from sysdep.h). */ |
25 | /* CFI_POP: the inverse annotations, accompanying a 4-byte pop of REG. */ |
26 | # define CFI_POP(REG) \ |
27 | cfi_adjust_cfa_offset (-4); \ |
28 | cfi_restore (REG) |
29 | /* PUSH/POP: pushl/popl of REG paired with the matching CFI annotations above. */ |
30 | # define PUSH(REG) pushl REG; CFI_PUSH (REG) |
31 | # define POP(REG) popl REG; CFI_POP (REG) |
32 | /* Offsets of the three stack arguments from %esp; they are read at function entry before anything is pushed.  PARMS skips the first 4 bytes, presumably the return address -- confirm against the i386 calling convention. */ |
33 | # define PARMS 4 |
34 | # define STR1 PARMS /* first argument: used as the buffer start address */ |
35 | # define STR2 STR1+4 /* second argument: its low byte is the value searched for */ |
36 | # define LEN STR2+4 /* third argument: used as the buffer length in bytes */ |
37 | /* __memrchr_sse2: search the LEN bytes starting at STR1 for the byte in the low 8 bits of STR2, scanning from the end; returns in %eax the address of the highest-addressed match, or 0 when there is none.  Register roles on the long path: ecx = base of the current 16-byte chunk / 64-byte window, edx = count of bytes not yet searched, xmm1 = search byte broadcast to 16 lanes, eax = pmovmskb match mask and finally the result.  Only the unaligned short-buffer path touches edi, and it saves/restores it. */ |
38 | atom_text_section |
39 | ENTRY (__memrchr_sse2) |
40 | mov STR1(%esp), %ecx /* ecx = buffer start */ |
41 | movd STR2(%esp), %xmm1 /* low dword of xmm1 = search value */ |
42 | mov LEN(%esp), %edx /* edx = length */ |
43 | /* Buffers of at most 16 bytes (unsigned compare) take the masked short path. */ |
44 | sub $16, %edx |
45 | jbe L(length_less16) |
46 | /* Broadcast the search byte to all 16 lanes of xmm1 (two punpcklbw here, pshufd below) while pointing ecx at the last 16 bytes of the buffer. */ |
47 | punpcklbw %xmm1, %xmm1 |
48 | add %edx, %ecx |
49 | punpcklbw %xmm1, %xmm1 |
50 | /* Check the final 16 bytes first, with an unaligned load. */ |
51 | movdqu (%ecx), %xmm0 |
52 | pshufd $0, %xmm1, %xmm1 |
53 | pcmpeqb %xmm1, %xmm0 |
54 | /* eax = bitmask of matching lanes; bit k corresponds to the byte at ecx + k. */ |
55 | pmovmskb %xmm0, %eax |
56 | test %eax, %eax |
57 | jnz L(exit_dispatch) |
58 | /* Step ecx down to the next 64-byte window; edx (= length - 16) counts the bytes below the chunk just checked, so the buffer starts at ecx + 64 - edx.  That relation is kept by every adjustment below. */ |
59 | sub $64, %ecx |
60 | mov %ecx, %eax |
61 | and $15, %eax |
62 | jz L(loop_prolog) |
63 | /* ecx is not 16-byte aligned: round it up to the next boundary and grow edx by the same amount, so the window re-covers a few already-checked bytes. */ |
64 | lea 16(%ecx), %ecx |
65 | lea 16(%edx), %edx |
66 | sub %eax, %edx |
67 | and $-16, %ecx |
68 | |
69 | .p2align 4 |
70 | /* Loop start on aligned string. */ |
71 | L(loop_prolog): |
72 | sub $64, %edx |
73 | jbe L(exit_loop) /* 64 or fewer bytes left: bounded tail handling */ |
74 | /* First unrolled pass: test the four aligned 16-byte chunks of the window from highest to lowest address. */ |
75 | movdqa 48(%ecx), %xmm0 |
76 | pcmpeqb %xmm1, %xmm0 |
77 | pmovmskb %xmm0, %eax |
78 | test %eax, %eax |
79 | jnz L(matches48) |
80 | |
81 | movdqa 32(%ecx), %xmm2 |
82 | pcmpeqb %xmm1, %xmm2 |
83 | pmovmskb %xmm2, %eax |
84 | test %eax, %eax |
85 | jnz L(matches32) |
86 | |
87 | movdqa 16(%ecx), %xmm3 |
88 | pcmpeqb %xmm1, %xmm3 |
89 | pmovmskb %xmm3, %eax |
90 | test %eax, %eax |
91 | jnz L(matches16) |
92 | |
93 | movdqa (%ecx), %xmm4 |
94 | pcmpeqb %xmm1, %xmm4 |
95 | pmovmskb %xmm4, %eax |
96 | test %eax, %eax |
97 | jnz L(exit_dispatch) /* match in chunk 0: ecx is already its base */ |
98 | /* Second unrolled pass over the next lower 64-byte window. */ |
99 | sub $64, %ecx |
100 | sub $64, %edx |
101 | jbe L(exit_loop) |
102 | |
103 | movdqa 48(%ecx), %xmm0 |
104 | pcmpeqb %xmm1, %xmm0 |
105 | pmovmskb %xmm0, %eax |
106 | test %eax, %eax |
107 | jnz L(matches48) |
108 | |
109 | movdqa 32(%ecx), %xmm2 |
110 | pcmpeqb %xmm1, %xmm2 |
111 | pmovmskb %xmm2, %eax |
112 | test %eax, %eax |
113 | jnz L(matches32) |
114 | |
115 | movdqa 16(%ecx), %xmm3 |
116 | pcmpeqb %xmm1, %xmm3 |
117 | pmovmskb %xmm3, %eax |
118 | test %eax, %eax |
119 | jnz L(matches16) |
120 | |
121 | movdqa (%ecx), %xmm3 |
122 | pcmpeqb %xmm1, %xmm3 |
123 | pmovmskb %xmm3, %eax |
124 | test %eax, %eax |
125 | jnz L(exit_dispatch) |
126 | /* Still nothing: if ecx is not 64-byte aligned, round it up to the next 64-byte boundary and grow edx by the distance moved, then enter the main loop. */ |
127 | mov %ecx, %eax |
128 | and $63, %eax |
129 | test %eax, %eax |
130 | jz L(align64_loop) |
131 | |
132 | lea 64(%ecx), %ecx |
133 | lea 64(%edx), %edx |
134 | and $-64, %ecx |
135 | sub %eax, %edx |
136 | /* Main loop: one 64-byte aligned window per iteration, moving towards lower addresses. */ |
137 | .p2align 4 |
138 | L(align64_loop): |
139 | sub $64, %ecx |
140 | sub $64, %edx |
141 | jbe L(exit_loop) /* 64 or fewer bytes left: bounded tail handling */ |
142 | |
143 | movdqa (%ecx), %xmm0 |
144 | movdqa 16(%ecx), %xmm2 |
145 | movdqa 32(%ecx), %xmm3 |
146 | movdqa 48(%ecx), %xmm4 |
147 | |
148 | pcmpeqb %xmm1, %xmm0 |
149 | pcmpeqb %xmm1, %xmm2 |
150 | pcmpeqb %xmm1, %xmm3 |
151 | pcmpeqb %xmm1, %xmm4 |
152 | /* Each compare result byte is 0x00 or 0xff, so pmaxub merges the four results; a zero mask means no match anywhere in the window. */ |
153 | pmaxub %xmm3, %xmm0 |
154 | pmaxub %xmm4, %xmm2 |
155 | pmaxub %xmm0, %xmm2 |
156 | pmovmskb %xmm2, %eax |
157 | |
158 | test %eax, %eax |
159 | jz L(align64_loop) |
160 | /* A match exists somewhere in the window.  xmm4 (chunk 48) and xmm3 (chunk 32) still hold their own compare results. */ |
161 | pmovmskb %xmm4, %eax |
162 | test %eax, %eax |
163 | jnz L(matches48) |
164 | |
165 | pmovmskb %xmm3, %eax |
166 | test %eax, %eax |
167 | jnz L(matches32) |
168 | /* xmm2 and xmm0 were overwritten by the merge, so chunk 16 is reloaded and chunk 0 is compared straight into xmm1 (the pattern is not needed again: every path from here returns). */ |
169 | movdqa 16(%ecx), %xmm2 |
170 | |
171 | pcmpeqb %xmm1, %xmm2 |
172 | pcmpeqb (%ecx), %xmm1 |
173 | |
174 | pmovmskb %xmm2, %eax |
175 | test %eax, %eax |
176 | jnz L(matches16) |
177 | /* The match must be in chunk 0: return the address of the highest set mask bit, using the same bit-test ladder as L(exit_dispatch). */ |
178 | pmovmskb %xmm1, %eax |
179 | test %ah, %ah |
180 | jnz L(exit_dispatch_high) |
181 | mov %al, %dl |
182 | and $15 << 4, %dl |
183 | jnz L(exit_dispatch_8) |
184 | test $0x08, %al |
185 | jnz L(exit_4) |
186 | test $0x04, %al |
187 | jnz L(exit_3) |
188 | test $0x02, %al |
189 | jnz L(exit_2) |
190 | mov %ecx, %eax /* only bit 0 is left: the match is the byte at ecx */ |
191 | ret |
192 | /* Tail: at most 64 bytes remain.  After edx is restored below, the buffer starts at ecx + 64 - edx, so only the top edx bytes of the window [ecx, ecx + 64) may be reported. */ |
193 | .p2align 4 |
194 | L(exit_loop): |
195 | add $64, %edx |
196 | cmp $32, %edx |
197 | jbe L(exit_loop_32) |
198 | /* More than 32 bytes remain: chunks 48 and 32 lie wholly inside the buffer and use the unbounded match handlers. */ |
199 | movdqa 48(%ecx), %xmm0 |
200 | pcmpeqb %xmm1, %xmm0 |
201 | pmovmskb %xmm0, %eax |
202 | test %eax, %eax |
203 | jnz L(matches48) |
204 | |
205 | movdqa 32(%ecx), %xmm2 |
206 | pcmpeqb %xmm1, %xmm2 |
207 | pmovmskb %xmm2, %eax |
208 | test %eax, %eax |
209 | jnz L(matches32) |
210 | |
211 | movdqa 16(%ecx), %xmm3 |
212 | pcmpeqb %xmm1, %xmm3 |
213 | pmovmskb %xmm3, %eax |
214 | test %eax, %eax |
215 | jnz L(matches16_1) /* chunk 16 may straddle the buffer start: bounded handler */ |
216 | cmp $48, %edx |
217 | jbe L(return_null) /* chunk 0 lies entirely before the buffer */ |
218 | |
219 | pcmpeqb (%ecx), %xmm1 |
220 | pmovmskb %xmm1, %eax |
221 | test %eax, %eax |
222 | jnz L(matches0_1) |
223 | xor %eax, %eax |
224 | ret |
225 | /* At most 32 bytes remain: only chunks 48 and 32 can contain buffer bytes, and either may straddle the buffer start, so both use bounded handlers. */ |
226 | .p2align 4 |
227 | L(exit_loop_32): |
228 | movdqa 48(%ecx), %xmm0 |
229 | pcmpeqb %xmm1, %xmm0 |
230 | pmovmskb %xmm0, %eax |
231 | test %eax, %eax |
232 | jnz L(matches48_1) |
233 | cmp $16, %edx |
234 | jbe L(return_null) /* chunk 32 lies entirely before the buffer */ |
235 | |
236 | pcmpeqb 32(%ecx), %xmm1 |
237 | pmovmskb %xmm1, %eax |
238 | test %eax, %eax |
239 | jnz L(matches32_1) |
240 | xor %eax, %eax |
241 | ret |
242 | /* L(matchesNN): unbounded match in the chunk at ecx + NN (eax = its non-zero mask).  Rebase ecx to that chunk and return the address of the highest matching byte. */ |
243 | .p2align 4 |
244 | L(matches16): |
245 | lea 16(%ecx), %ecx |
246 | test %ah, %ah |
247 | jnz L(exit_dispatch_high) |
248 | mov %al, %dl |
249 | and $15 << 4, %dl |
250 | jnz L(exit_dispatch_8) |
251 | test $0x08, %al |
252 | jnz L(exit_4) |
253 | test $0x04, %al |
254 | jnz L(exit_3) |
255 | test $0x02, %al |
256 | jnz L(exit_2) |
257 | mov %ecx, %eax |
258 | ret |
259 | |
260 | .p2align 4 |
261 | L(matches32): |
262 | lea 32(%ecx), %ecx |
263 | test %ah, %ah |
264 | jnz L(exit_dispatch_high) |
265 | mov %al, %dl |
266 | and $15 << 4, %dl |
267 | jnz L(exit_dispatch_8) |
268 | test $0x08, %al |
269 | jnz L(exit_4) |
270 | test $0x04, %al |
271 | jnz L(exit_3) |
272 | test $0x02, %al |
273 | jnz L(exit_2) |
274 | mov %ecx, %eax |
275 | ret |
276 | |
277 | .p2align 4 |
278 | L(matches48): |
279 | lea 48(%ecx), %ecx |
280 | /* L(exit_dispatch): eax = non-zero 16-bit match mask for the chunk at ecx.  Tests the bits from 15 down to 1 and returns ecx + (index of the highest set bit); dl/dh are used as scratch. */ |
281 | .p2align 4 |
282 | L(exit_dispatch): |
283 | test %ah, %ah |
284 | jnz L(exit_dispatch_high) |
285 | mov %al, %dl |
286 | and $15 << 4, %dl |
287 | jnz L(exit_dispatch_8) |
288 | test $0x08, %al |
289 | jnz L(exit_4) |
290 | test $0x04, %al |
291 | jnz L(exit_3) |
292 | test $0x02, %al |
293 | jnz L(exit_2) |
294 | mov %ecx, %eax |
295 | ret |
296 | /* Some mask bit in 4..7 is set and bits 8..15 are clear; falling through all tests means bit 4. */ |
297 | .p2align 4 |
298 | L(exit_dispatch_8): |
299 | test $0x80, %al |
300 | jnz L(exit_8) |
301 | test $0x40, %al |
302 | jnz L(exit_7) |
303 | test $0x20, %al |
304 | jnz L(exit_6) |
305 | lea 4(%ecx), %eax |
306 | ret |
307 | /* Some mask bit in 8..15 is set (examined through %ah); falling through all tests means bit 8. */ |
308 | .p2align 4 |
309 | L(exit_dispatch_high): |
310 | mov %ah, %dh |
311 | and $15 << 4, %dh |
312 | jnz L(exit_dispatch_high_8) |
313 | test $0x08, %ah |
314 | jnz L(exit_12) |
315 | test $0x04, %ah |
316 | jnz L(exit_11) |
317 | test $0x02, %ah |
318 | jnz L(exit_10) |
319 | lea 8(%ecx), %eax |
320 | ret |
321 | /* Some mask bit in 12..15 is set; falling through all tests means bit 12. */ |
322 | .p2align 4 |
323 | L(exit_dispatch_high_8): |
324 | test $0x80, %ah |
325 | jnz L(exit_16) |
326 | test $0x40, %ah |
327 | jnz L(exit_15) |
328 | test $0x20, %ah |
329 | jnz L(exit_14) |
330 | lea 12(%ecx), %eax |
331 | ret |
332 | /* L(exit_N): the highest match is the N-th byte of the chunk, so the result is ecx + N - 1. */ |
333 | .p2align 4 |
334 | L(exit_2): |
335 | lea 1(%ecx), %eax |
336 | ret |
337 | |
338 | .p2align 4 |
339 | L(exit_3): |
340 | lea 2(%ecx), %eax |
341 | ret |
342 | |
343 | .p2align 4 |
344 | L(exit_4): |
345 | lea 3(%ecx), %eax |
346 | ret |
347 | |
348 | .p2align 4 |
349 | L(exit_6): |
350 | lea 5(%ecx), %eax |
351 | ret |
352 | |
353 | .p2align 4 |
354 | L(exit_7): |
355 | lea 6(%ecx), %eax |
356 | ret |
357 | |
358 | .p2align 4 |
359 | L(exit_8): |
360 | lea 7(%ecx), %eax |
361 | ret |
362 | |
363 | .p2align 4 |
364 | L(exit_10): |
365 | lea 9(%ecx), %eax |
366 | ret |
367 | |
368 | .p2align 4 |
369 | L(exit_11): |
370 | lea 10(%ecx), %eax |
371 | ret |
372 | |
373 | .p2align 4 |
374 | L(exit_12): |
375 | lea 11(%ecx), %eax |
376 | ret |
377 | |
378 | .p2align 4 |
379 | L(exit_14): |
380 | lea 13(%ecx), %eax |
381 | ret |
382 | |
383 | .p2align 4 |
384 | L(exit_15): |
385 | lea 14(%ecx), %eax |
386 | ret |
387 | |
388 | .p2align 4 |
389 | L(exit_16): |
390 | lea 15(%ecx), %eax |
391 | ret |
392 | /* L(matchesNN_1): bounded variants used by the tail code.  edx enters as the remaining byte count and is rebased to edx - (64 - NN); a match at chunk offset k is inside the buffer only when edx + k >= 0, which the L(exit_1_*) stubs check.  %al/%ah serve as scratch here because edx is live. */ |
393 | .p2align 4 |
394 | L(matches0_1): |
395 | lea -64(%edx), %edx |
396 | |
397 | test %ah, %ah |
398 | jnz L(exit_dispatch_1_high) |
399 | mov %al, %ah |
400 | and $15 << 4, %ah |
401 | jnz L(exit_dispatch_1_8) |
402 | test $0x08, %al |
403 | jnz L(exit_1_4) |
404 | test $0x04, %al |
405 | jnz L(exit_1_3) |
406 | test $0x02, %al |
407 | jnz L(exit_1_2) |
408 | add $0, %edx /* sets the sign flag from edx for the range check at offset 0 */ |
409 | jl L(return_null) |
410 | mov %ecx, %eax |
411 | ret |
412 | |
413 | .p2align 4 |
414 | L(matches16_1): |
415 | lea -48(%edx), %edx |
416 | lea 16(%ecx), %ecx |
417 | |
418 | test %ah, %ah |
419 | jnz L(exit_dispatch_1_high) |
420 | mov %al, %ah |
421 | and $15 << 4, %ah |
422 | jnz L(exit_dispatch_1_8) |
423 | test $0x08, %al |
424 | jnz L(exit_1_4) |
425 | test $0x04, %al |
426 | jnz L(exit_1_3) |
427 | test $0x02, %al |
428 | jnz L(exit_1_2) |
429 | add $0, %edx |
430 | jl L(return_null) |
431 | mov %ecx, %eax |
432 | ret |
433 | |
434 | .p2align 4 |
435 | L(matches32_1): |
436 | lea -32(%edx), %edx |
437 | lea 32(%ecx), %ecx |
438 | |
439 | test %ah, %ah |
440 | jnz L(exit_dispatch_1_high) |
441 | mov %al, %ah |
442 | and $15 << 4, %ah |
443 | jnz L(exit_dispatch_1_8) |
444 | test $0x08, %al |
445 | jnz L(exit_1_4) |
446 | test $0x04, %al |
447 | jnz L(exit_1_3) |
448 | test $0x02, %al |
449 | jnz L(exit_1_2) |
450 | add $0, %edx |
451 | jl L(return_null) |
452 | mov %ecx, %eax |
453 | ret |
454 | |
455 | .p2align 4 |
456 | L(matches48_1): |
457 | lea -16(%edx), %edx |
458 | lea 48(%ecx), %ecx |
459 | /* L(exit_dispatch_1): same bit-test ladder as L(exit_dispatch), but every candidate is range-checked against the rebased edx before it is returned. */ |
460 | .p2align 4 |
461 | L(exit_dispatch_1): |
462 | test %ah, %ah |
463 | jnz L(exit_dispatch_1_high) |
464 | mov %al, %ah |
465 | and $15 << 4, %ah |
466 | jnz L(exit_dispatch_1_8) |
467 | test $0x08, %al |
468 | jnz L(exit_1_4) |
469 | test $0x04, %al |
470 | jnz L(exit_1_3) |
471 | test $0x02, %al |
472 | jnz L(exit_1_2) |
473 | add $0, %edx |
474 | jl L(return_null) |
475 | mov %ecx, %eax |
476 | ret |
477 | /* Bounded: some mask bit in 4..7 is set; falling through all tests means bit 4. */ |
478 | .p2align 4 |
479 | L(exit_dispatch_1_8): |
480 | test $0x80, %al |
481 | jnz L(exit_1_8) |
482 | test $0x40, %al |
483 | jnz L(exit_1_7) |
484 | test $0x20, %al |
485 | jnz L(exit_1_6) |
486 | add $4, %edx |
487 | jl L(return_null) |
488 | lea 4(%ecx), %eax |
489 | ret |
490 | /* Bounded: some mask bit in 8..15 is set; %al is reused as scratch since only %ah is examined from here on. */ |
491 | .p2align 4 |
492 | L(exit_dispatch_1_high): |
493 | mov %ah, %al |
494 | and $15 << 4, %al |
495 | jnz L(exit_dispatch_1_high_8) |
496 | test $0x08, %ah |
497 | jnz L(exit_1_12) |
498 | test $0x04, %ah |
499 | jnz L(exit_1_11) |
500 | test $0x02, %ah |
501 | jnz L(exit_1_10) |
502 | add $8, %edx |
503 | jl L(return_null) |
504 | lea 8(%ecx), %eax |
505 | ret |
506 | /* Bounded: some mask bit in 12..15 is set; falling through all tests means bit 12. */ |
507 | .p2align 4 |
508 | L(exit_dispatch_1_high_8): |
509 | test $0x80, %ah |
510 | jnz L(exit_1_16) |
511 | test $0x40, %ah |
512 | jnz L(exit_1_15) |
513 | test $0x20, %ah |
514 | jnz L(exit_1_14) |
515 | add $12, %edx |
516 | jl L(return_null) |
517 | lea 12(%ecx), %eax |
518 | ret |
519 | /* L(exit_1_N): the candidate is ecx + N - 1.  Adding N - 1 to the rebased edx gives a negative result when that byte lies before the buffer start, in which case 0 is returned. */ |
520 | .p2align 4 |
521 | L(exit_1_2): |
522 | add $1, %edx |
523 | jl L(return_null) |
524 | lea 1(%ecx), %eax |
525 | ret |
526 | |
527 | .p2align 4 |
528 | L(exit_1_3): |
529 | add $2, %edx |
530 | jl L(return_null) |
531 | lea 2(%ecx), %eax |
532 | ret |
533 | |
534 | .p2align 4 |
535 | L(exit_1_4): |
536 | add $3, %edx |
537 | jl L(return_null) |
538 | lea 3(%ecx), %eax |
539 | ret |
540 | |
541 | .p2align 4 |
542 | L(exit_1_6): |
543 | add $5, %edx |
544 | jl L(return_null) |
545 | lea 5(%ecx), %eax |
546 | ret |
547 | |
548 | .p2align 4 |
549 | L(exit_1_7): |
550 | add $6, %edx |
551 | jl L(return_null) |
552 | lea 6(%ecx), %eax |
553 | ret |
554 | |
555 | .p2align 4 |
556 | L(exit_1_8): |
557 | add $7, %edx |
558 | jl L(return_null) |
559 | lea 7(%ecx), %eax |
560 | ret |
561 | |
562 | .p2align 4 |
563 | L(exit_1_10): |
564 | add $9, %edx |
565 | jl L(return_null) |
566 | lea 9(%ecx), %eax |
567 | ret |
568 | |
569 | .p2align 4 |
570 | L(exit_1_11): |
571 | add $10, %edx |
572 | jl L(return_null) |
573 | lea 10(%ecx), %eax |
574 | ret |
575 | |
576 | .p2align 4 |
577 | L(exit_1_12): |
578 | add $11, %edx |
579 | jl L(return_null) |
580 | lea 11(%ecx), %eax |
581 | ret |
582 | |
583 | .p2align 4 |
584 | L(exit_1_14): |
585 | add $13, %edx |
586 | jl L(return_null) |
587 | lea 13(%ecx), %eax |
588 | ret |
589 | |
590 | .p2align 4 |
591 | L(exit_1_15): |
592 | add $14, %edx |
593 | jl L(return_null) |
594 | lea 14(%ecx), %eax |
595 | ret |
596 | |
597 | .p2align 4 |
598 | L(exit_1_16): |
599 | add $15, %edx |
600 | jl L(return_null) |
601 | lea 15(%ecx), %eax |
602 | ret |
603 | /* No match: return 0 (paths that have not pushed edi). */ |
604 | .p2align 4 |
605 | L(return_null): |
606 | xor %eax, %eax |
607 | ret |
608 | /* Short buffer starting on a 16-byte boundary.  On entry from L(length_less16): eax = buffer start, edx = length (1..16), xmm1 = broadcast search byte. */ |
609 | .p2align 4 |
610 | L(length_less16_offset0): |
611 | mov %dl, %cl /* shift count = length */ |
612 | pcmpeqb (%eax), %xmm1 |
613 | /* edx = (1 << length) - 1: keeps only the mask bits of bytes inside the buffer. */ |
614 | mov $1, %edx |
615 | sal %cl, %edx |
616 | sub $1, %edx |
617 | |
618 | mov %eax, %ecx /* L(exit_dispatch) expects the chunk base in ecx */ |
619 | pmovmskb %xmm1, %eax |
620 | |
621 | and %edx, %eax |
622 | test %eax, %eax |
623 | jnz L(exit_dispatch) |
624 | |
625 | xor %eax, %eax |
626 | ret |
627 | /* Length <= 16.  On entry edx holds length - 16 and ecx the buffer start; the search byte is broadcast into xmm1 here. */ |
628 | .p2align 4 |
629 | L(length_less16): |
630 | punpcklbw %xmm1, %xmm1 |
631 | add $16, %edx /* edx = length again */ |
632 | je L(return_null) /* zero-length buffer */ |
633 | punpcklbw %xmm1, %xmm1 |
634 | |
635 | mov %ecx, %eax |
636 | pshufd $0, %xmm1, %xmm1 |
637 | /* ecx = misalignment of the buffer start within its 16-byte block (eax keeps the start itself). */ |
638 | and $15, %ecx |
639 | jz L(length_less16_offset0) |
640 | /* Unaligned short buffer: edi holds the match mask on this path and is preserved for the caller with PUSH/POP. */ |
641 | PUSH (%edi) |
642 | /* dh = misalignment + length - 16; a positive value means the buffer spills into the next aligned block.  eax = aligned block containing the buffer start. */ |
643 | mov %cl, %dh |
644 | add %dl, %dh |
645 | and $-16, %eax |
646 | |
647 | sub $16, %dh |
648 | ja L(length_less16_part2) |
649 | /* Buffer lies within one aligned block: shift the mask so that bit 0 is the first buffer byte, then drop the bits at or beyond the length. */ |
650 | pcmpeqb (%eax), %xmm1 |
651 | pmovmskb %xmm1, %edi |
652 | |
653 | sar %cl, %edi |
654 | add %ecx, %eax /* eax = buffer start again */ |
655 | mov %dl, %cl /* shift count = length */ |
656 | |
657 | mov $1, %edx |
658 | sal %cl, %edx |
659 | sub $1, %edx |
660 | |
661 | and %edx, %edi |
662 | test %edi, %edi |
663 | jz L(ret_null) |
664 | /* bsr yields the index of the highest matching byte relative to the buffer start. */ |
665 | bsr %edi, %edi |
666 | add %edi, %eax |
667 | POP (%edi) |
668 | ret |
669 | |
670 | CFI_PUSH (%edi) /* unwind info only: the code below is entered with edi still pushed */ |
671 | /* Buffer spans two aligned blocks.  On entry: dh = number of buffer bytes in the upper block, cl = misalignment, eax = lower block. */ |
672 | .p2align 4 |
673 | L(length_less16_part2): |
674 | movdqa 16(%eax), %xmm2 |
675 | pcmpeqb %xmm1, %xmm2 |
676 | pmovmskb %xmm2, %edi |
677 | |
678 | mov %cl, %ch /* keep the misalignment in ch while cl is used as the shift count */ |
679 | |
680 | mov %dh, %cl |
681 | mov $1, %edx |
682 | sal %cl, %edx |
683 | sub $1, %edx |
684 | |
685 | and %edx, %edi /* keep only upper-block bytes that belong to the buffer */ |
686 | |
687 | test %edi, %edi |
688 | jnz L(length_less16_part2_return) |
689 | /* Nothing in the upper block: test the lower block, shifting out the bytes that precede the buffer start. */ |
690 | pcmpeqb (%eax), %xmm1 |
691 | pmovmskb %xmm1, %edi |
692 | |
693 | mov %ch, %cl |
694 | sar %cl, %edi |
695 | test %edi, %edi |
696 | jz L(ret_null) |
697 | /* Result = lower block + index + misalignment; ch is cleared so that ecx equals the misalignment. */ |
698 | bsr %edi, %edi |
699 | add %edi, %eax |
700 | xor %ch, %ch |
701 | add %ecx, %eax |
702 | POP (%edi) |
703 | ret |
704 | |
705 | CFI_PUSH (%edi) /* unwind info only: the code below is entered with edi still pushed */ |
706 | /* Match in the upper block: result = lower block + 16 + index of the highest set bit. */ |
707 | .p2align 4 |
708 | L(length_less16_part2_return): |
709 | bsr %edi, %edi |
710 | lea 16(%eax, %edi), %eax |
711 | POP (%edi) |
712 | ret |
713 | |
714 | CFI_PUSH (%edi) /* unwind info only: the code below is entered with edi still pushed */ |
715 | /* No match on the unaligned short path: return 0 and restore edi. */ |
716 | .p2align 4 |
717 | L(ret_null): |
718 | xor %eax, %eax |
719 | POP (%edi) |
720 | ret |
721 | |
722 | END (__memrchr_sse2) |
723 | #endif |
724 | |