1 | /* Optimized memchr with sse2 without bsf |
2 | Copyright (C) 2011-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | /* Push/pop helpers that pair each stack instruction with call-frame annotations. The cfi_* macros are not defined in this file (presumably they come from <sysdep.h>). */ |
23 | # define CFI_PUSH(REG) \ |
24 | cfi_adjust_cfa_offset (4); \ |
25 | cfi_rel_offset (REG, 0) |
26 | /* CFI_POP is the inverse annotation of CFI_PUSH: CFA offset adjusted by -4 and REG marked as restored. */ |
27 | # define CFI_POP(REG) \ |
28 | cfi_adjust_cfa_offset (-4); \ |
29 | cfi_restore (REG) |
30 | /* PUSH/POP: the pushl/popl instruction followed by the matching annotation. */ |
31 | # define PUSH(REG) pushl REG; CFI_PUSH (REG) |
32 | # define POP(REG) popl REG; CFI_POP (REG) |
33 | /* Build-variant glue. Without USE_AS_RAWMEMCHR (memchr build) %edi is pushed on entry and the first argument is read from 8(%esp); with it (rawmemchr build) nothing is pushed, the first argument is read from 4(%esp), and no RETURN macro is defined (that build uses a plain ret). */ |
34 | # ifndef USE_AS_RAWMEMCHR |
35 | # define ENTRANCE PUSH(%edi); |
36 | # define PARMS 8 |
37 | # define RETURN POP(%edi); ret; CFI_PUSH(%edi); /* the trailing CFI_PUSH re-applies the "%edi pushed" annotation for the code that follows the ret */ |
38 | # else |
39 | # define ENTRANCE |
40 | # define PARMS 4 |
41 | # endif |
42 | /* Argument offsets from %esp after ENTRANCE: STR1 is loaded as the buffer pointer and STR2 as the byte to search for. */ |
43 | # define STR1 PARMS |
44 | # define STR2 STR1+4 |
45 | /* LEN (memchr build only) is loaded as the byte count. */ |
46 | # ifndef USE_AS_RAWMEMCHR |
47 | # define LEN STR2+4 |
48 | # endif |
49 | /* Symbol name of the function: __memchr_sse2 unless MEMCHR was already defined. */ |
50 | # ifndef MEMCHR |
51 | # define MEMCHR __memchr_sse2 |
52 | # endif |
53 | /* MEMCHR (pointer, byte[, length]): return in %eax the address of the first byte equal to the low byte of the second argument, or 0 when the memchr build finds none within the given length. The rawmemchr build takes no length and does no bounds checks. Register roles: xmm1 = search byte in all 16 lanes; memchr build: edi = current pointer, edx = bytes remaining; rawmemchr build: edx = current pointer. */ |
54 | atom_text_section |
55 | ENTRY (MEMCHR) |
56 | ENTRANCE /* memchr build: push %edi; rawmemchr build: empty */ |
57 | mov STR1(%esp), %ecx /* ecx = buffer pointer */ |
58 | movd STR2(%esp), %xmm1 /* low dword of xmm1 = search byte argument */ |
59 | # ifndef USE_AS_RAWMEMCHR |
60 | mov LEN(%esp), %edx /* edx = length */ |
61 | test %edx, %edx |
62 | jz L(return_null) /* length 0: nothing to search */ |
63 | # endif |
64 | /* Broadcast the search byte to every lane of xmm1 (punpcklbw, punpcklbw, pshufd), interleaved with the pointer setup. */ |
65 | punpcklbw %xmm1, %xmm1 /* low 2 bytes = search byte */ |
66 | # ifndef USE_AS_RAWMEMCHR |
67 | mov %ecx, %edi |
68 | # else |
69 | mov %ecx, %edx |
70 | # endif |
71 | punpcklbw %xmm1, %xmm1 /* low 4 bytes = search byte */ |
72 | |
73 | and $63, %ecx /* ecx = pointer offset within its 64-byte block */ |
74 | pshufd $0, %xmm1, %xmm1 /* all 16 bytes = search byte */ |
75 | cmp $48, %ecx |
76 | ja L(crosscache) /* offset > 48: a 16-byte load from the pointer would extend past the 64-byte block */ |
77 | /* First 16 bytes, loaded unaligned straight from the pointer. */ |
78 | # ifndef USE_AS_RAWMEMCHR |
79 | movdqu (%edi), %xmm0 |
80 | # else |
81 | movdqu (%edx), %xmm0 |
82 | # endif |
83 | pcmpeqb %xmm1, %xmm0 /* 0xff in every lane equal to the search byte */ |
84 | pmovmskb %xmm0, %eax /* eax bit i set <=> byte i matched */ |
85 | test %eax, %eax |
86 | # ifndef USE_AS_RAWMEMCHR |
87 | jnz L(match_case2_prolog) /* match found; that path still checks it against the length in edx */ |
88 | /* No match: give up if the length is at most 16; otherwise move edi to the next 16-byte boundary and make edx the byte count remaining from there. */ |
89 | sub $16, %edx |
90 | jbe L(return_null) /* length <= 16: the whole buffer has been examined */ |
91 | lea 16(%edi), %edi |
92 | and $15, %ecx /* ecx = pointer & 15 */ |
93 | and $-16, %edi /* edi = (pointer + 16) rounded down to 16 */ |
94 | add %ecx, %edx /* edx = length - 16 + (pointer & 15) = bytes remaining from edi */ |
95 | # else |
96 | jnz L(match_case1_prolog) /* match found; no length to check in this build */ |
97 | lea 16(%edx), %edx |
98 | and $-16, %edx /* edx = (pointer + 16) rounded down to 16 */ |
99 | # endif |
100 | jmp L(loop_prolog) |
101 | /* L(crosscache): reached when (pointer & 63) > 48. Load the aligned 16-byte chunk that contains the pointer and shift the match mask right so that bit 0 corresponds to the pointer itself. Falls through into L(loop_prolog) when there is no match. */ |
102 | .p2align 4 |
103 | L(crosscache): |
104 | and $15, %ecx /* ecx = pointer & 15 = bytes of the chunk that precede the pointer */ |
105 | # ifndef USE_AS_RAWMEMCHR |
106 | and $-16, %edi /* edi = pointer rounded down to 16 */ |
107 | movdqa (%edi), %xmm0 |
108 | # else |
109 | and $-16, %edx /* edx = pointer rounded down to 16 */ |
110 | movdqa (%edx), %xmm0 |
111 | # endif |
112 | pcmpeqb %xmm1, %xmm0 |
113 | pmovmskb %xmm0, %eax |
114 | sar %cl, %eax /* drop the mask bits of the bytes before the pointer */ |
115 | test %eax, %eax |
116 | |
117 | # ifndef USE_AS_RAWMEMCHR |
118 | jnz L(match_case2_prolog1) /* match: that entry adds ecx back to edi, then checks the length */ |
119 | /* "ecx" is less than 16. Calculate "edx + ecx - 16" by using |
120 | "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to avoid |
121 | possible addition overflow. */ |
122 | neg %ecx |
123 | add $16, %ecx /* ecx = 16 - (pointer & 15) = bytes just examined */ |
124 | sub %ecx, %edx /* edx = bytes remaining after this chunk */ |
125 | jbe L(return_null) /* the buffer ended inside this chunk */ |
126 | lea 16(%edi), %edi /* edi = next aligned chunk */ |
127 | # else |
128 | jnz L(match_case1_prolog1) /* match: that entry adds ecx back to edx */ |
129 | lea 16(%edx), %edx /* edx = next aligned chunk */ |
130 | # endif |
131 | /* L(loop_prolog): the pointer is 16-byte aligned here. Check two groups of four 16-byte chunks, one chunk at a time; ecx holds the chunk's byte offset within its group (0/16/32/48) for L(match_case1) to add to the pointer. In the memchr build a group is only entered while more than 64 bytes remain, otherwise control goes to L(exit_loop). Afterwards the pointer is rounded down to a 64-byte boundary for L(align64_loop). */ |
132 | .p2align 4 |
133 | L(loop_prolog): |
134 | # ifndef USE_AS_RAWMEMCHR |
135 | sub $64, %edx |
136 | jbe L(exit_loop) /* 64 bytes or fewer remained; L(exit_loop) adds the 64 back */ |
137 | movdqa (%edi), %xmm0 |
138 | # else |
139 | movdqa (%edx), %xmm0 |
140 | # endif |
141 | pcmpeqb %xmm1, %xmm0 |
142 | xor %ecx, %ecx /* chunk offset 0 */ |
143 | pmovmskb %xmm0, %eax |
144 | test %eax, %eax |
145 | jnz L(match_case1) |
146 | |
147 | # ifndef USE_AS_RAWMEMCHR |
148 | movdqa 16(%edi), %xmm2 |
149 | # else |
150 | movdqa 16(%edx), %xmm2 |
151 | # endif |
152 | pcmpeqb %xmm1, %xmm2 |
153 | lea 16(%ecx), %ecx /* chunk offset 16 */ |
154 | pmovmskb %xmm2, %eax |
155 | test %eax, %eax |
156 | jnz L(match_case1) |
157 | |
158 | # ifndef USE_AS_RAWMEMCHR |
159 | movdqa 32(%edi), %xmm3 |
160 | # else |
161 | movdqa 32(%edx), %xmm3 |
162 | # endif |
163 | pcmpeqb %xmm1, %xmm3 |
164 | lea 16(%ecx), %ecx /* chunk offset 32 */ |
165 | pmovmskb %xmm3, %eax |
166 | test %eax, %eax |
167 | jnz L(match_case1) |
168 | |
169 | # ifndef USE_AS_RAWMEMCHR |
170 | movdqa 48(%edi), %xmm4 |
171 | # else |
172 | movdqa 48(%edx), %xmm4 |
173 | # endif |
174 | pcmpeqb %xmm1, %xmm4 |
175 | lea 16(%ecx), %ecx /* chunk offset 48 */ |
176 | pmovmskb %xmm4, %eax |
177 | test %eax, %eax |
178 | jnz L(match_case1) |
179 | /* Second group of four chunks, same pattern as the first. */ |
180 | # ifndef USE_AS_RAWMEMCHR |
181 | lea 64(%edi), %edi |
182 | sub $64, %edx |
183 | jbe L(exit_loop) /* 64 bytes or fewer remained */ |
184 | |
185 | movdqa (%edi), %xmm0 |
186 | # else |
187 | lea 64(%edx), %edx |
188 | movdqa (%edx), %xmm0 |
189 | # endif |
190 | pcmpeqb %xmm1, %xmm0 |
191 | xor %ecx, %ecx /* chunk offset 0 */ |
192 | pmovmskb %xmm0, %eax |
193 | test %eax, %eax |
194 | jnz L(match_case1) |
195 | |
196 | # ifndef USE_AS_RAWMEMCHR |
197 | movdqa 16(%edi), %xmm2 |
198 | # else |
199 | movdqa 16(%edx), %xmm2 |
200 | # endif |
201 | pcmpeqb %xmm1, %xmm2 |
202 | lea 16(%ecx), %ecx /* chunk offset 16 */ |
203 | pmovmskb %xmm2, %eax |
204 | test %eax, %eax |
205 | jnz L(match_case1) |
206 | |
207 | # ifndef USE_AS_RAWMEMCHR |
208 | movdqa 32(%edi), %xmm3 |
209 | # else |
210 | movdqa 32(%edx), %xmm3 |
211 | # endif |
212 | pcmpeqb %xmm1, %xmm3 |
213 | lea 16(%ecx), %ecx /* chunk offset 32 */ |
214 | pmovmskb %xmm3, %eax |
215 | test %eax, %eax |
216 | jnz L(match_case1) |
217 | |
218 | # ifndef USE_AS_RAWMEMCHR |
219 | movdqa 48(%edi), %xmm4 |
220 | # else |
221 | movdqa 48(%edx), %xmm4 |
222 | # endif |
223 | pcmpeqb %xmm1, %xmm4 |
224 | lea 16(%ecx), %ecx /* chunk offset 48 */ |
225 | pmovmskb %xmm4, %eax |
226 | test %eax, %eax |
227 | jnz L(match_case1) |
228 | /* Advance past the second group, then round the pointer down to a 64-byte boundary; the memchr build adds the bytes stepped back (ecx) to the remaining count. */ |
229 | # ifndef USE_AS_RAWMEMCHR |
230 | lea 64(%edi), %edi |
231 | mov %edi, %ecx |
232 | and $-64, %edi /* edi = 64-byte aligned */ |
233 | and $63, %ecx /* ecx = bytes stepped back by the alignment */ |
234 | add %ecx, %edx /* those bytes count as remaining again */ |
235 | # else |
236 | lea 64(%edx), %edx |
237 | and $-64, %edx /* edx = 64-byte aligned */ |
238 | # endif |
239 | /* L(align64_loop): main loop, 64 bytes per iteration from a 64-byte-aligned pointer. The four compare results are merged with pmaxub so that a single pmovmskb/test tells whether any of the 64 bytes matched. In the memchr build the loop exits to L(exit_loop) once 64 bytes or fewer remain. */ |
240 | .p2align 4 |
241 | L(align64_loop): |
242 | |
243 | # ifndef USE_AS_RAWMEMCHR |
244 | sub $64, %edx |
245 | jbe L(exit_loop) /* 64 bytes or fewer remained; L(exit_loop) adds the 64 back */ |
246 | movdqa (%edi), %xmm0 |
247 | movdqa 16(%edi), %xmm2 |
248 | movdqa 32(%edi), %xmm3 |
249 | movdqa 48(%edi), %xmm4 |
250 | # else |
251 | movdqa (%edx), %xmm0 |
252 | movdqa 16(%edx), %xmm2 |
253 | movdqa 32(%edx), %xmm3 |
254 | movdqa 48(%edx), %xmm4 |
255 | # endif |
256 | pcmpeqb %xmm1, %xmm0 |
257 | pcmpeqb %xmm1, %xmm2 |
258 | pcmpeqb %xmm1, %xmm3 |
259 | pcmpeqb %xmm1, %xmm4 |
260 | /* Lanes are 0x00 or 0xff, so the unsigned byte maximum acts as a bitwise OR of the results. */ |
261 | pmaxub %xmm0, %xmm3 /* xmm3 = chunk 0 | chunk 2 (chunk 2's own result is lost) */ |
262 | pmaxub %xmm2, %xmm4 /* xmm4 = chunk 1 | chunk 3 */ |
263 | pmaxub %xmm3, %xmm4 /* xmm4 = all four chunks merged */ |
264 | # ifndef USE_AS_RAWMEMCHR |
265 | add $64, %edi |
266 | # else |
267 | add $64, %edx |
268 | # endif |
269 | pmovmskb %xmm4, %eax |
270 | |
271 | test %eax, %eax |
272 | jz L(align64_loop) |
273 | /* At least one byte of the last 64 matched: step the pointer back and find the first chunk that has a match. */ |
274 | # ifndef USE_AS_RAWMEMCHR |
275 | sub $64, %edi |
276 | # else |
277 | sub $64, %edx |
278 | # endif |
279 | |
280 | pmovmskb %xmm0, %eax /* chunk 0 result is still intact in xmm0 */ |
281 | xor %ecx, %ecx /* chunk offset 0 */ |
282 | test %eax, %eax |
283 | jnz L(match_case1) |
284 | |
285 | pmovmskb %xmm2, %eax /* chunk 1 result is still intact in xmm2 */ |
286 | lea 16(%ecx), %ecx /* chunk offset 16 */ |
287 | test %eax, %eax |
288 | jnz L(match_case1) |
289 | /* xmm3 was overwritten by the merge above, so chunk 2 is loaded and compared again. */ |
290 | # ifndef USE_AS_RAWMEMCHR |
291 | movdqa 32(%edi), %xmm3 |
292 | # else |
293 | movdqa 32(%edx), %xmm3 |
294 | # endif |
295 | pcmpeqb %xmm1, %xmm3 |
296 | pmovmskb %xmm3, %eax |
297 | lea 16(%ecx), %ecx /* chunk offset 32 */ |
298 | test %eax, %eax |
299 | jnz L(match_case1) |
300 | /* Chunks 0-2 had no match, so the match is in chunk 3. The compare overwrites xmm1 (the search byte); every path from here returns, so it is not needed again. Falls through into L(match_case1). */ |
301 | # ifndef USE_AS_RAWMEMCHR |
302 | pcmpeqb 48(%edi), %xmm1 |
303 | # else |
304 | pcmpeqb 48(%edx), %xmm1 |
305 | # endif |
306 | pmovmskb %xmm1, %eax |
307 | lea 16(%ecx), %ecx /* chunk offset 48 */ |
308 | /* L(match_case1): eax is a non-zero 16-bit match mask for the chunk at pointer + ecx. Add ecx to the pointer and return the address of the byte for the lowest set mask bit, found with a chain of bit tests (this file does not use bsf). No length check is made on this path. Note the label numbering: L(ExitCase1_N) returns pointer + N - 1. */ |
309 | .p2align 4 |
310 | L(match_case1): |
311 | # ifndef USE_AS_RAWMEMCHR |
312 | add %ecx, %edi /* edi = address of the matching chunk */ |
313 | # else |
314 | L(match_case1_prolog1): |
315 | add %ecx, %edx /* edx = address of the bytes the mask describes */ |
316 | L(match_case1_prolog): |
317 | # endif |
318 | test %al, %al |
319 | jz L(match_case1_high) /* mask bits 0-7 clear: match is in bytes 8-15 */ |
320 | mov %al, %cl |
321 | and $15, %cl |
322 | jz L(match_case1_8) /* mask bits 0-3 clear: match is in bytes 4-7 */ |
323 | test $0x01, %al |
324 | jnz L(ExitCase1_1) |
325 | test $0x02, %al |
326 | jnz L(ExitCase1_2) |
327 | test $0x04, %al |
328 | jnz L(ExitCase1_3) |
329 | # ifndef USE_AS_RAWMEMCHR |
330 | lea 3(%edi), %eax /* only bit 3 is left: byte 3 */ |
331 | RETURN |
332 | # else |
333 | lea 3(%edx), %eax /* only bit 3 is left: byte 3 */ |
334 | ret |
335 | # endif |
336 | /* Match is in bytes 4-7. */ |
337 | .p2align 4 |
338 | L(match_case1_8): |
339 | test $0x10, %al |
340 | jnz L(ExitCase1_5) |
341 | test $0x20, %al |
342 | jnz L(ExitCase1_6) |
343 | test $0x40, %al |
344 | jnz L(ExitCase1_7) |
345 | # ifndef USE_AS_RAWMEMCHR |
346 | lea 7(%edi), %eax /* only bit 7 is left: byte 7 */ |
347 | RETURN |
348 | # else |
349 | lea 7(%edx), %eax /* only bit 7 is left: byte 7 */ |
350 | ret |
351 | # endif |
352 | /* Match is in bytes 8-15; %ah holds mask bits 8-15. */ |
353 | .p2align 4 |
354 | L(match_case1_high): |
355 | mov %ah, %ch |
356 | and $15, %ch |
357 | jz L(match_case1_high_8) /* mask bits 8-11 clear: match is in bytes 12-15 */ |
358 | test $0x01, %ah |
359 | jnz L(ExitCase1_9) |
360 | test $0x02, %ah |
361 | jnz L(ExitCase1_10) |
362 | test $0x04, %ah |
363 | jnz L(ExitCase1_11) |
364 | # ifndef USE_AS_RAWMEMCHR |
365 | lea 11(%edi), %eax /* only bit 11 is left: byte 11 */ |
366 | RETURN |
367 | # else |
368 | lea 11(%edx), %eax /* only bit 11 is left: byte 11 */ |
369 | ret |
370 | # endif |
371 | /* Match is in bytes 12-15. */ |
372 | .p2align 4 |
373 | L(match_case1_high_8): |
374 | test $0x10, %ah |
375 | jnz L(ExitCase1_13) |
376 | test $0x20, %ah |
377 | jnz L(ExitCase1_14) |
378 | test $0x40, %ah |
379 | jnz L(ExitCase1_15) |
380 | # ifndef USE_AS_RAWMEMCHR |
381 | lea 15(%edi), %eax /* only bit 15 is left: byte 15 */ |
382 | RETURN |
383 | # else |
384 | lea 15(%edx), %eax /* only bit 15 is left: byte 15 */ |
385 | ret |
386 | # endif |
387 | /* L(exit_loop) (memchr build only): reached when a "sub $64, %edx" found 64 bytes or fewer remaining. Restore the count, then check up to four 16-byte chunks, stopping as soon as the count shows the buffer ends within the chunks already checked. Matches go to L(match_case2), which checks the match position against the count. */ |
388 | # ifndef USE_AS_RAWMEMCHR |
389 | .p2align 4 |
390 | L(exit_loop): |
391 | add $64, %edx /* undo the subtraction: edx = bytes remaining from edi */ |
392 | |
393 | movdqa (%edi), %xmm0 |
394 | pcmpeqb %xmm1, %xmm0 |
395 | xor %ecx, %ecx /* chunk offset 0 */ |
396 | pmovmskb %xmm0, %eax |
397 | test %eax, %eax |
398 | jnz L(match_case2) |
399 | cmp $16, %edx |
400 | jbe L(return_null) /* the buffer ended within chunk 0 */ |
401 | |
402 | movdqa 16(%edi), %xmm2 |
403 | pcmpeqb %xmm1, %xmm2 |
404 | lea 16(%ecx), %ecx /* chunk offset 16 */ |
405 | pmovmskb %xmm2, %eax |
406 | test %eax, %eax |
407 | jnz L(match_case2) |
408 | cmp $32, %edx |
409 | jbe L(return_null) /* the buffer ended within chunk 1 */ |
410 | |
411 | movdqa 32(%edi), %xmm3 |
412 | pcmpeqb %xmm1, %xmm3 |
413 | lea 16(%ecx), %ecx /* chunk offset 32 */ |
414 | pmovmskb %xmm3, %eax |
415 | test %eax, %eax |
416 | jnz L(match_case2) |
417 | cmp $48, %edx |
418 | jbe L(return_null) /* the buffer ended within chunk 2 */ |
419 | |
420 | pcmpeqb 48(%edi), %xmm1 /* last compare: overwrites xmm1, which is not used again */ |
421 | lea 16(%ecx), %ecx /* chunk offset 48 */ |
422 | pmovmskb %xmm1, %eax |
423 | test %eax, %eax |
424 | jnz L(match_case2) |
425 | |
426 | xor %eax, %eax /* no match in the remaining bytes: return 0 */ |
427 | RETURN |
428 | # endif |
429 | /* Return stubs used by L(match_case1). L(ExitCase1_N) returns pointer + N - 1, where the pointer is edi in the memchr build and edx in the rawmemchr build. There are no stubs for N = 4, 8, 12 and 16 because those bytes are returned inline by the decode code. */ |
430 | .p2align 4 |
431 | L(ExitCase1_1): |
432 | # ifndef USE_AS_RAWMEMCHR |
433 | mov %edi, %eax |
434 | RETURN |
435 | # else |
436 | mov %edx, %eax |
437 | ret |
438 | # endif |
439 | |
440 | .p2align 4 |
441 | L(ExitCase1_2): |
442 | # ifndef USE_AS_RAWMEMCHR |
443 | lea 1(%edi), %eax |
444 | RETURN |
445 | # else |
446 | lea 1(%edx), %eax |
447 | ret |
448 | # endif |
449 | |
450 | .p2align 4 |
451 | L(ExitCase1_3): |
452 | # ifndef USE_AS_RAWMEMCHR |
453 | lea 2(%edi), %eax |
454 | RETURN |
455 | # else |
456 | lea 2(%edx), %eax |
457 | ret |
458 | # endif |
459 | |
460 | .p2align 4 |
461 | L(ExitCase1_5): |
462 | # ifndef USE_AS_RAWMEMCHR |
463 | lea 4(%edi), %eax |
464 | RETURN |
465 | # else |
466 | lea 4(%edx), %eax |
467 | ret |
468 | # endif |
469 | |
470 | .p2align 4 |
471 | L(ExitCase1_6): |
472 | # ifndef USE_AS_RAWMEMCHR |
473 | lea 5(%edi), %eax |
474 | RETURN |
475 | # else |
476 | lea 5(%edx), %eax |
477 | ret |
478 | # endif |
479 | |
480 | .p2align 4 |
481 | L(ExitCase1_7): |
482 | # ifndef USE_AS_RAWMEMCHR |
483 | lea 6(%edi), %eax |
484 | RETURN |
485 | # else |
486 | lea 6(%edx), %eax |
487 | ret |
488 | # endif |
489 | |
490 | .p2align 4 |
491 | L(ExitCase1_9): |
492 | # ifndef USE_AS_RAWMEMCHR |
493 | lea 8(%edi), %eax |
494 | RETURN |
495 | # else |
496 | lea 8(%edx), %eax |
497 | ret |
498 | # endif |
499 | |
500 | .p2align 4 |
501 | L(ExitCase1_10): |
502 | # ifndef USE_AS_RAWMEMCHR |
503 | lea 9(%edi), %eax |
504 | RETURN |
505 | # else |
506 | lea 9(%edx), %eax |
507 | ret |
508 | # endif |
509 | |
510 | .p2align 4 |
511 | L(ExitCase1_11): |
512 | # ifndef USE_AS_RAWMEMCHR |
513 | lea 10(%edi), %eax |
514 | RETURN |
515 | # else |
516 | lea 10(%edx), %eax |
517 | ret |
518 | # endif |
519 | |
520 | .p2align 4 |
521 | L(ExitCase1_13): |
522 | # ifndef USE_AS_RAWMEMCHR |
523 | lea 12(%edi), %eax |
524 | RETURN |
525 | # else |
526 | lea 12(%edx), %eax |
527 | ret |
528 | # endif |
529 | |
530 | .p2align 4 |
531 | L(ExitCase1_14): |
532 | # ifndef USE_AS_RAWMEMCHR |
533 | lea 13(%edi), %eax |
534 | RETURN |
535 | # else |
536 | lea 13(%edx), %eax |
537 | ret |
538 | # endif |
539 | |
540 | .p2align 4 |
541 | L(ExitCase1_15): |
542 | # ifndef USE_AS_RAWMEMCHR |
543 | lea 14(%edi), %eax |
544 | RETURN |
545 | # else |
546 | lea 14(%edx), %eax |
547 | ret |
548 | # endif |
549 | /* L(match_case2) (memchr build only): decode a non-zero match mask in eax like L(match_case1), but the match may lie beyond the end of the buffer, so each exit first checks the remaining count: byte index k is returned only if at least k + 1 bytes remain, otherwise 0 is returned. Three entry points: match_case2 (edx counts from edi, ecx = chunk offset), match_case2_prolog1 (edx already counts from edi + ecx), match_case2_prolog (edi and edx already refer to the bytes the mask describes). */ |
550 | # ifndef USE_AS_RAWMEMCHR |
551 | .p2align 4 |
552 | L(match_case2): |
553 | sub %ecx, %edx /* edx = bytes remaining from the start of the matching chunk */ |
554 | L(match_case2_prolog1): |
555 | add %ecx, %edi /* edi = address of the first byte the mask describes */ |
556 | L(match_case2_prolog): |
557 | test %al, %al |
558 | jz L(match_case2_high) /* mask bits 0-7 clear: match is in bytes 8-15 */ |
559 | mov %al, %cl |
560 | and $15, %cl |
561 | jz L(match_case2_8) /* mask bits 0-3 clear: match is in bytes 4-7 */ |
562 | test $0x01, %al |
563 | jnz L(ExitCase2_1) |
564 | test $0x02, %al |
565 | jnz L(ExitCase2_2) |
566 | test $0x04, %al |
567 | jnz L(ExitCase2_3) |
568 | sub $4, %edx |
569 | jb L(return_null) /* fewer than 4 bytes remain: byte 3 is outside the buffer */ |
570 | lea 3(%edi), %eax |
571 | RETURN |
572 | /* Match is in bytes 4-7. */ |
573 | .p2align 4 |
574 | L(match_case2_8): |
575 | test $0x10, %al |
576 | jnz L(ExitCase2_5) |
577 | test $0x20, %al |
578 | jnz L(ExitCase2_6) |
579 | test $0x40, %al |
580 | jnz L(ExitCase2_7) |
581 | sub $8, %edx |
582 | jb L(return_null) /* fewer than 8 bytes remain: byte 7 is outside the buffer */ |
583 | lea 7(%edi), %eax |
584 | RETURN |
585 | /* Match is in bytes 8-15; %ah holds mask bits 8-15. */ |
586 | .p2align 4 |
587 | L(match_case2_high): |
588 | mov %ah, %ch |
589 | and $15, %ch |
590 | jz L(match_case2_high_8) /* mask bits 8-11 clear: match is in bytes 12-15 */ |
591 | test $0x01, %ah |
592 | jnz L(ExitCase2_9) |
593 | test $0x02, %ah |
594 | jnz L(ExitCase2_10) |
595 | test $0x04, %ah |
596 | jnz L(ExitCase2_11) |
597 | sub $12, %edx |
598 | jb L(return_null) /* fewer than 12 bytes remain: byte 11 is outside the buffer */ |
599 | lea 11(%edi), %eax |
600 | RETURN |
601 | /* Match is in bytes 12-15. */ |
602 | .p2align 4 |
603 | L(match_case2_high_8): |
604 | test $0x10, %ah |
605 | jnz L(ExitCase2_13) |
606 | test $0x20, %ah |
607 | jnz L(ExitCase2_14) |
608 | test $0x40, %ah |
609 | jnz L(ExitCase2_15) |
610 | sub $16, %edx |
611 | jb L(return_null) /* fewer than 16 bytes remain: byte 15 is outside the buffer */ |
612 | lea 15(%edi), %eax |
613 | RETURN |
614 | /* Return stubs: L(ExitCase2_N) returns edi + N - 1 if at least N bytes remain, else 0. */ |
615 | .p2align 4 |
616 | L(ExitCase2_1): |
617 | mov %edi, %eax /* no count check: every path into this decode has at least one byte remaining */ |
618 | RETURN |
619 | |
620 | .p2align 4 |
621 | L(ExitCase2_2): |
622 | sub $2, %edx |
623 | jb L(return_null) |
624 | lea 1(%edi), %eax |
625 | RETURN |
626 | |
627 | .p2align 4 |
628 | L(ExitCase2_3): |
629 | sub $3, %edx |
630 | jb L(return_null) |
631 | lea 2(%edi), %eax |
632 | RETURN |
633 | |
634 | .p2align 4 |
635 | L(ExitCase2_5): |
636 | sub $5, %edx |
637 | jb L(return_null) |
638 | lea 4(%edi), %eax |
639 | RETURN |
640 | |
641 | .p2align 4 |
642 | L(ExitCase2_6): |
643 | sub $6, %edx |
644 | jb L(return_null) |
645 | lea 5(%edi), %eax |
646 | RETURN |
647 | |
648 | .p2align 4 |
649 | L(ExitCase2_7): |
650 | sub $7, %edx |
651 | jb L(return_null) |
652 | lea 6(%edi), %eax |
653 | RETURN |
654 | |
655 | .p2align 4 |
656 | L(ExitCase2_9): |
657 | sub $9, %edx |
658 | jb L(return_null) |
659 | lea 8(%edi), %eax |
660 | RETURN |
661 | |
662 | .p2align 4 |
663 | L(ExitCase2_10): |
664 | sub $10, %edx |
665 | jb L(return_null) |
666 | lea 9(%edi), %eax |
667 | RETURN |
668 | |
669 | .p2align 4 |
670 | L(ExitCase2_11): |
671 | sub $11, %edx |
672 | jb L(return_null) |
673 | lea 10(%edi), %eax |
674 | RETURN |
675 | |
676 | .p2align 4 |
677 | L(ExitCase2_13): |
678 | sub $13, %edx |
679 | jb L(return_null) |
680 | lea 12(%edi), %eax |
681 | RETURN |
682 | |
683 | .p2align 4 |
684 | L(ExitCase2_14): |
685 | sub $14, %edx |
686 | jb L(return_null) |
687 | lea 13(%edi), %eax |
688 | RETURN |
689 | |
690 | .p2align 4 |
691 | L(ExitCase2_15): |
692 | sub $15, %edx |
693 | jb L(return_null) |
694 | lea 14(%edi), %eax |
695 | RETURN |
696 | # endif |
697 | /* L(return_null): common "not found" exit; returns 0 in %eax. The memchr build restores %edi through RETURN, the rawmemchr build returns directly. */ |
698 | .p2align 4 |
699 | L(return_null): |
700 | xor %eax, %eax /* eax = 0 */ |
701 | # ifndef USE_AS_RAWMEMCHR |
702 | RETURN |
703 | # else |
704 | ret |
705 | # endif |
706 | |
707 | END (MEMCHR) |
708 | #endif |
709 | |