1 | /* strlen with SSE2 |
2 | Copyright (C) 2010-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | /* For strlen, only the SHARED version is optimized. For strcat, strncat and strnlen, both the STATIC and SHARED versions are optimized. */ |
20 | |
21 | #if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc) |
22 | /* Everything down to the final endif of this file is assembled only when the condition above holds. */ |
23 | # ifndef USE_AS_STRCAT |
24 | /* Standalone build, not the strcat include: define the argument offsets, the return sequence and the entry symbol. */ |
25 | # include <sysdep.h> |
26 | # define PARMS 4 |
27 | # define STR PARMS |
28 | # define RETURN ret |
29 | /* The strnlen build keeps its limit in edi. LEN is only used after PUSH (%edi) and before the matching pop, so its offset includes the 4 bytes of the saved edi. */ |
30 | # ifdef USE_AS_STRNLEN |
31 | # define LEN PARMS + 8 |
32 | # define CFI_PUSH(REG) \ |
33 | cfi_adjust_cfa_offset (4); \ |
34 | cfi_rel_offset (REG, 0) |
35 | /* CFI_PUSH and CFI_POP emit the unwind annotations for a 4-byte push or pop of REG. */ |
36 | # define CFI_POP(REG) \ |
37 | cfi_adjust_cfa_offset (-4); \ |
38 | cfi_restore (REG) |
39 | /* PUSH and POP pair the instruction with its annotation. RETURN pops edi, then reapplies CFI_PUSH before ret, presumably so the unwind state stays correct for the code that follows each return site. */ |
40 | # define PUSH(REG) pushl REG; CFI_PUSH (REG) |
41 | # define POP(REG) popl REG; CFI_POP (REG) |
42 | # undef RETURN |
43 | # define RETURN POP (%edi); CFI_PUSH(%edi); ret |
44 | # endif |
45 | /* Default entry symbol, used unless STRLEN was already defined before this point. */ |
46 | # ifndef STRLEN |
47 | # define STRLEN __strlen_sse2 |
48 | # endif |
49 | |
50 | atom_text_section |
51 | ENTRY (STRLEN) /* Returns in eax the offset of the first zero byte of the string. The strnlen build returns its limit argument when no zero byte lies within the limit. */ |
52 | mov STR(%esp), %edx /* edx = string pointer, read from 4 bytes above esp at entry */ |
53 | # ifdef USE_AS_STRNLEN |
54 | PUSH (%edi) |
55 | movl LEN(%esp), %edi /* edi = limit argument */ |
56 | sub $4, %edi |
57 | jbe L(len_less4_prolog) /* unsigned test: limit is 4 or less */ |
58 | # endif |
59 | # endif |
60 | xor %eax, %eax /* eax = 0. With USE_AS_STRCAT the text of this file starts here, so edx must already hold the string pointer (set by the includer, not visible here). */ |
61 | cmpb $0, (%edx) |
62 | jz L(exit_tail0) |
63 | cmpb $0, 1(%edx) |
64 | jz L(exit_tail1) |
65 | cmpb $0, 2(%edx) |
66 | jz L(exit_tail2) |
67 | cmpb $0, 3(%edx) |
68 | jz L(exit_tail3) |
69 | /* Bytes 0 to 3 are nonzero. strnlen build: leave for the bounded prolog if the limit is 8 or less. */ |
70 | # ifdef USE_AS_STRNLEN |
71 | sub $4, %edi |
72 | jbe L(len_less8_prolog) |
73 | # endif |
74 | /* Bytes 4 to 7, checked one at a time. */ |
75 | cmpb $0, 4(%edx) |
76 | jz L(exit_tail4) |
77 | cmpb $0, 5(%edx) |
78 | jz L(exit_tail5) |
79 | cmpb $0, 6(%edx) |
80 | jz L(exit_tail6) |
81 | cmpb $0, 7(%edx) |
82 | jz L(exit_tail7) |
83 | /* strnlen build: leave for the bounded prolog if the limit is 12 or less. */ |
84 | # ifdef USE_AS_STRNLEN |
85 | sub $4, %edi |
86 | jbe L(len_less12_prolog) |
87 | # endif |
88 | /* Bytes 8 to 11. */ |
89 | cmpb $0, 8(%edx) |
90 | jz L(exit_tail8) |
91 | cmpb $0, 9(%edx) |
92 | jz L(exit_tail9) |
93 | cmpb $0, 10(%edx) |
94 | jz L(exit_tail10) |
95 | cmpb $0, 11(%edx) |
96 | jz L(exit_tail11) |
97 | /* strnlen build: leave for the bounded prolog if the limit is 16 or less. Past this point edi holds the limit minus 16. */ |
98 | # ifdef USE_AS_STRNLEN |
99 | sub $4, %edi |
100 | jbe L(len_less16_prolog) |
101 | # endif |
102 | /* Bytes 12 to 15. After these the first 16 bytes are known to be nonzero. */ |
103 | cmpb $0, 12(%edx) |
104 | jz L(exit_tail12) |
105 | cmpb $0, 13(%edx) |
106 | jz L(exit_tail13) |
107 | cmpb $0, 14(%edx) |
108 | jz L(exit_tail14) |
109 | cmpb $0, 15(%edx) |
110 | jz L(exit_tail15) |
111 | |
112 | pxor %xmm0, %xmm0 /* xmm0 = 0, the value each 16-byte block is compared against */ |
113 | lea 16(%edx), %eax |
114 | mov %eax, %ecx /* ecx = string pointer plus 16. It is subtracted at L(exit), where eax is already 16 past the matching block. */ |
115 | and $-16, %eax /* eax = first 16-byte aligned address above the string pointer. All bytes before it were checked one by one above. */ |
116 | /* strnlen build: edi was limit minus 16. Adding the misalignment of the string pointer makes it the number of bytes from eax that are still within the limit. */ |
117 | # ifdef USE_AS_STRNLEN |
118 | and $15, %edx |
119 | add %edx, %edi |
120 | sub $64, %edi |
121 | jbe L(len_less64) /* 64 or fewer bytes left: finish with the bounded checks */ |
122 | # endif |
123 | /* First group of four 16-byte checks. lea advances eax without changing the flags set by test, so at L(exit) eax is 16 past the block that holds the zero byte. */ |
124 | pcmpeqb (%eax), %xmm0 |
125 | pmovmskb %xmm0, %edx |
126 | pxor %xmm1, %xmm1 |
127 | test %edx, %edx |
128 | lea 16(%eax), %eax |
129 | jnz L(exit) |
130 | |
131 | pcmpeqb (%eax), %xmm1 |
132 | pmovmskb %xmm1, %edx |
133 | pxor %xmm2, %xmm2 |
134 | test %edx, %edx |
135 | lea 16(%eax), %eax |
136 | jnz L(exit) |
137 | |
138 | pcmpeqb (%eax), %xmm2 |
139 | pmovmskb %xmm2, %edx |
140 | pxor %xmm3, %xmm3 |
141 | test %edx, %edx |
142 | lea 16(%eax), %eax |
143 | jnz L(exit) |
144 | |
145 | pcmpeqb (%eax), %xmm3 |
146 | pmovmskb %xmm3, %edx |
147 | test %edx, %edx |
148 | lea 16(%eax), %eax |
149 | jnz L(exit) |
150 | /* A compare that found no zero byte leaves its destination register all zero, so xmm0 to xmm3 are reused below as zero operands without being cleared again. */ |
151 | # ifdef USE_AS_STRNLEN |
152 | sub $64, %edi |
153 | jbe L(len_less64) |
154 | # endif |
155 | /* Second group of four 16-byte checks. */ |
156 | pcmpeqb (%eax), %xmm0 |
157 | pmovmskb %xmm0, %edx |
158 | test %edx, %edx |
159 | lea 16(%eax), %eax |
160 | jnz L(exit) |
161 | |
162 | pcmpeqb (%eax), %xmm1 |
163 | pmovmskb %xmm1, %edx |
164 | test %edx, %edx |
165 | lea 16(%eax), %eax |
166 | jnz L(exit) |
167 | |
168 | pcmpeqb (%eax), %xmm2 |
169 | pmovmskb %xmm2, %edx |
170 | test %edx, %edx |
171 | lea 16(%eax), %eax |
172 | jnz L(exit) |
173 | |
174 | pcmpeqb (%eax), %xmm3 |
175 | pmovmskb %xmm3, %edx |
176 | test %edx, %edx |
177 | lea 16(%eax), %eax |
178 | jnz L(exit) |
179 | |
180 | # ifdef USE_AS_STRNLEN |
181 | sub $64, %edi |
182 | jbe L(len_less64) |
183 | # endif |
184 | /* Third group of four 16-byte checks. */ |
185 | pcmpeqb (%eax), %xmm0 |
186 | pmovmskb %xmm0, %edx |
187 | test %edx, %edx |
188 | lea 16(%eax), %eax |
189 | jnz L(exit) |
190 | |
191 | pcmpeqb (%eax), %xmm1 |
192 | pmovmskb %xmm1, %edx |
193 | test %edx, %edx |
194 | lea 16(%eax), %eax |
195 | jnz L(exit) |
196 | |
197 | pcmpeqb (%eax), %xmm2 |
198 | pmovmskb %xmm2, %edx |
199 | test %edx, %edx |
200 | lea 16(%eax), %eax |
201 | jnz L(exit) |
202 | |
203 | pcmpeqb (%eax), %xmm3 |
204 | pmovmskb %xmm3, %edx |
205 | test %edx, %edx |
206 | lea 16(%eax), %eax |
207 | jnz L(exit) |
208 | |
209 | # ifdef USE_AS_STRNLEN |
210 | sub $64, %edi |
211 | jbe L(len_less64) |
212 | # endif |
213 | /* Fourth group of four 16-byte checks. */ |
214 | pcmpeqb (%eax), %xmm0 |
215 | pmovmskb %xmm0, %edx |
216 | test %edx, %edx |
217 | lea 16(%eax), %eax |
218 | jnz L(exit) |
219 | |
220 | pcmpeqb (%eax), %xmm1 |
221 | pmovmskb %xmm1, %edx |
222 | test %edx, %edx |
223 | lea 16(%eax), %eax |
224 | jnz L(exit) |
225 | |
226 | pcmpeqb (%eax), %xmm2 |
227 | pmovmskb %xmm2, %edx |
228 | test %edx, %edx |
229 | lea 16(%eax), %eax |
230 | jnz L(exit) |
231 | |
232 | pcmpeqb (%eax), %xmm3 |
233 | pmovmskb %xmm3, %edx |
234 | test %edx, %edx |
235 | lea 16(%eax), %eax |
236 | jnz L(exit) |
237 | /* strnlen build: eax is about to be rounded down to a 64-byte boundary, so the bytes that will be scanned again are added back to the remaining count in edi. */ |
238 | # ifdef USE_AS_STRNLEN |
239 | mov %eax, %edx |
240 | and $63, %edx |
241 | add %edx, %edi |
242 | # endif |
243 | /* Round eax down to a 64-byte boundary for the aligned loads of the main loop. */ |
244 | and $-0x40, %eax |
245 | |
246 | .p2align 4 |
247 | L(aligned_64_loop): |
248 | # ifdef USE_AS_STRNLEN |
249 | sub $64, %edi |
250 | jbe L(len_less64) /* 64 or fewer bytes remain within the limit */ |
251 | # endif |
252 | movaps (%eax), %xmm0 /* load 64 bytes as four aligned 16-byte blocks */ |
253 | movaps 16(%eax), %xmm1 |
254 | movaps 32(%eax), %xmm2 |
255 | movaps 48(%eax), %xmm6 |
256 | pminub %xmm1, %xmm0 /* unsigned byte minimum across the blocks: a zero byte in any block gives a zero byte in the final xmm2 */ |
257 | pminub %xmm6, %xmm2 |
258 | pminub %xmm0, %xmm2 |
259 | pcmpeqb %xmm3, %xmm2 /* xmm3 is all zero here and is not written inside the loop */ |
260 | pmovmskb %xmm2, %edx |
261 | test %edx, %edx |
262 | lea 64(%eax), %eax /* advance eax, flags from test are preserved */ |
263 | jz L(aligned_64_loop) |
264 | /* A zero byte lies in the 64 bytes just before eax. Find its 16-byte block, adjusting ecx each step so that eax minus ecx at L(exit) is the offset of that block from the string pointer. */ |
265 | pcmpeqb -64(%eax), %xmm3 |
266 | pmovmskb %xmm3, %edx |
267 | test %edx, %edx |
268 | lea 48(%ecx), %ecx /* ecx = string pointer plus 64, matching the first block */ |
269 | jnz L(exit) |
270 | /* xmm1 and xmm6 still hold the second and fourth blocks. The registers that held the first and third were overwritten by pminub, so those blocks are compared from memory. */ |
271 | pcmpeqb %xmm1, %xmm3 |
272 | pmovmskb %xmm3, %edx |
273 | test %edx, %edx |
274 | lea -16(%ecx), %ecx |
275 | jnz L(exit) |
276 | |
277 | pcmpeqb -32(%eax), %xmm3 |
278 | pmovmskb %xmm3, %edx |
279 | test %edx, %edx |
280 | lea -16(%ecx), %ecx |
281 | jnz L(exit) |
282 | /* No test is needed for the fourth block: the zero byte must be there. Execution falls through to L(exit). */ |
283 | pcmpeqb %xmm6, %xmm3 |
284 | pmovmskb %xmm3, %edx |
285 | lea -16(%ecx), %ecx |
286 | L(exit): |
287 | sub %ecx, %eax /* eax = offset of the matching 16-byte block from the string pointer */ |
288 | test %dl, %dl |
289 | jz L(exit_high) /* no zero byte among bytes 0 to 7 of the block */ |
290 | /* Bit N of edx is set when byte N of the block is zero. The lowest set bit is found by testing a nibble first and then single bits. ecx is free to clobber from here on. */ |
291 | mov %dl, %cl |
292 | and $15, %cl |
293 | jz L(exit_8) /* bits 0 to 3 clear, so the lowest set bit is among 4 to 7 */ |
294 | test $0x01, %dl |
295 | jnz L(exit_tail0) |
296 | test $0x02, %dl |
297 | jnz L(exit_tail1) |
298 | test $0x04, %dl |
299 | jnz L(exit_tail2) |
300 | add $3, %eax /* only bit 3 is left */ |
301 | RETURN |
302 | /* Lowest set bit is among bits 4 to 7. */ |
303 | .p2align 4 |
304 | L(exit_8): |
305 | test $0x10, %dl |
306 | jnz L(exit_tail4) |
307 | test $0x20, %dl |
308 | jnz L(exit_tail5) |
309 | test $0x40, %dl |
310 | jnz L(exit_tail6) |
311 | add $7, %eax |
312 | RETURN |
313 | /* Low mask byte is zero: decode bits 8 to 15 from dh in the same way. */ |
314 | .p2align 4 |
315 | L(exit_high): |
316 | mov %dh, %ch |
317 | and $15, %ch |
318 | jz L(exit_high_8) |
319 | test $0x01, %dh |
320 | jnz L(exit_tail8) |
321 | test $0x02, %dh |
322 | jnz L(exit_tail9) |
323 | test $0x04, %dh |
324 | jnz L(exit_tail10) |
325 | add $11, %eax |
326 | RETURN |
327 | /* Lowest set bit is among bits 12 to 15. */ |
328 | .p2align 4 |
329 | L(exit_high_8): |
330 | test $0x10, %dh |
331 | jnz L(exit_tail12) |
332 | test $0x20, %dh |
333 | jnz L(exit_tail13) |
334 | test $0x40, %dh |
335 | jnz L(exit_tail14) |
336 | add $15, %eax /* falls through into the shared return below */ |
337 | L(exit_tail0): |
338 | RETURN |
339 | |
340 | # ifdef USE_AS_STRNLEN |
341 | /* strnlen only. At most 64 bytes from eax remain within the limit. Up to four 16-byte blocks are checked, and the limit is returned as soon as it is used up without finding a zero byte. */ |
342 | .p2align 4 |
343 | L(len_less64): |
344 | pxor %xmm0, %xmm0 |
345 | add $64, %edi /* undo the sub that led here: edi = bytes from eax still within the limit */ |
346 | |
347 | pcmpeqb (%eax), %xmm0 |
348 | pmovmskb %xmm0, %edx |
349 | pxor %xmm1, %xmm1 |
350 | lea 16(%eax), %eax |
351 | test %edx, %edx |
352 | jnz L(strnlen_exit) |
353 | /* No zero byte in that block. If the limit ends inside it, return the limit. */ |
354 | sub $16, %edi |
355 | jbe L(return_start_len) |
356 | |
357 | pcmpeqb (%eax), %xmm1 |
358 | pmovmskb %xmm1, %edx |
359 | lea 16(%eax), %eax |
360 | test %edx, %edx |
361 | jnz L(strnlen_exit) |
362 | |
363 | sub $16, %edi |
364 | jbe L(return_start_len) |
365 | /* xmm0 is all zero again after its earlier compare found nothing, so it is reused as the zero operand. The same holds for xmm1 in the last check. */ |
366 | pcmpeqb (%eax), %xmm0 |
367 | pmovmskb %xmm0, %edx |
368 | lea 16(%eax), %eax |
369 | test %edx, %edx |
370 | jnz L(strnlen_exit) |
371 | |
372 | sub $16, %edi |
373 | jbe L(return_start_len) |
374 | |
375 | pcmpeqb (%eax), %xmm1 |
376 | pmovmskb %xmm1, %edx |
377 | lea 16(%eax), %eax |
378 | test %edx, %edx |
379 | jnz L(strnlen_exit) |
380 | /* The fourth block had no zero byte and edi is at most 16 here, so the limit is reached: return it. */ |
381 | movl LEN(%esp), %eax |
382 | RETURN |
383 | |
384 | .p2align 4 |
385 | L(strnlen_exit): |
386 | sub %ecx, %eax /* eax = offset of the matching 16-byte block from the string pointer */ |
387 | /* Same lowest-set-bit decoding as L(exit). Here edi is the number of bytes from the start of this block that are within the limit, so index N is accepted only when edi is greater than N. Otherwise the limit is returned. */ |
388 | test %dl, %dl |
389 | jz L(strnlen_exit_high) |
390 | mov %dl, %cl |
391 | and $15, %cl |
392 | jz L(strnlen_exit_8) |
393 | test $0x01, %dl |
394 | jnz L(exit_tail0) /* index 0 needs no bound check: edi is at least 1 on every path that reaches this label */ |
395 | test $0x02, %dl |
396 | jnz L(strnlen_exit_tail1) |
397 | test $0x04, %dl |
398 | jnz L(strnlen_exit_tail2) |
399 | sub $4, %edi /* index 3 is within the limit only if edi is 4 or more */ |
400 | jb L(return_start_len) |
401 | lea 3(%eax), %eax |
402 | RETURN |
403 | /* Lowest set bit is among bits 4 to 7. */ |
404 | .p2align 4 |
405 | L(strnlen_exit_8): |
406 | test $0x10, %dl |
407 | jnz L(strnlen_exit_tail4) |
408 | test $0x20, %dl |
409 | jnz L(strnlen_exit_tail5) |
410 | test $0x40, %dl |
411 | jnz L(strnlen_exit_tail6) |
412 | sub $8, %edi |
413 | jb L(return_start_len) |
414 | lea 7(%eax), %eax |
415 | RETURN |
416 | /* Low mask byte is zero: decode bits 8 to 15 from dh. */ |
417 | .p2align 4 |
418 | L(strnlen_exit_high): |
419 | mov %dh, %ch |
420 | and $15, %ch |
421 | jz L(strnlen_exit_high_8) |
422 | test $0x01, %dh |
423 | jnz L(strnlen_exit_tail8) |
424 | test $0x02, %dh |
425 | jnz L(strnlen_exit_tail9) |
426 | test $0x04, %dh |
427 | jnz L(strnlen_exit_tail10) |
428 | sub $12, %edi |
429 | jb L(return_start_len) |
430 | lea 11(%eax), %eax |
431 | RETURN |
432 | /* Lowest set bit is among bits 12 to 15. */ |
433 | .p2align 4 |
434 | L(strnlen_exit_high_8): |
435 | test $0x10, %dh |
436 | jnz L(strnlen_exit_tail12) |
437 | test $0x20, %dh |
438 | jnz L(strnlen_exit_tail13) |
439 | test $0x40, %dh |
440 | jnz L(strnlen_exit_tail14) |
441 | sub $16, %edi |
442 | jb L(return_start_len) |
443 | lea 15(%eax), %eax |
444 | RETURN |
445 | |
446 | .p2align 4 |
447 | L(strnlen_exit_tail1): |
448 | sub $2, %edi /* each stub: index N is within the limit only if edi is N plus 1 or more, otherwise the limit is returned */ |
449 | jb L(return_start_len) |
450 | lea 1(%eax), %eax /* result = block offset plus index */ |
451 | RETURN |
452 | /* Index 2. */ |
453 | .p2align 4 |
454 | L(strnlen_exit_tail2): |
455 | sub $3, %edi |
456 | jb L(return_start_len) |
457 | lea 2(%eax), %eax |
458 | RETURN |
459 | /* Index 4. Index 3 is handled inline in L(strnlen_exit). */ |
460 | .p2align 4 |
461 | L(strnlen_exit_tail4): |
462 | sub $5, %edi |
463 | jb L(return_start_len) |
464 | lea 4(%eax), %eax |
465 | RETURN |
466 | /* Index 5. */ |
467 | .p2align 4 |
468 | L(strnlen_exit_tail5): |
469 | sub $6, %edi |
470 | jb L(return_start_len) |
471 | lea 5(%eax), %eax |
472 | RETURN |
473 | /* Index 6. */ |
474 | .p2align 4 |
475 | L(strnlen_exit_tail6): |
476 | sub $7, %edi |
477 | jb L(return_start_len) |
478 | lea 6(%eax), %eax |
479 | RETURN |
480 | /* Index 8. Index 7 is handled inline in L(strnlen_exit_8). */ |
481 | .p2align 4 |
482 | L(strnlen_exit_tail8): |
483 | sub $9, %edi |
484 | jb L(return_start_len) |
485 | lea 8(%eax), %eax |
486 | RETURN |
487 | /* Index 9. */ |
488 | .p2align 4 |
489 | L(strnlen_exit_tail9): |
490 | sub $10, %edi |
491 | jb L(return_start_len) |
492 | lea 9(%eax), %eax |
493 | RETURN |
494 | /* Index 10. */ |
495 | .p2align 4 |
496 | L(strnlen_exit_tail10): |
497 | sub $11, %edi |
498 | jb L(return_start_len) |
499 | lea 10(%eax), %eax |
500 | RETURN |
501 | /* Index 12. Index 11 is handled inline in L(strnlen_exit_high). */ |
502 | .p2align 4 |
503 | L(strnlen_exit_tail12): |
504 | sub $13, %edi |
505 | jb L(return_start_len) |
506 | lea 12(%eax), %eax |
507 | RETURN |
508 | /* Index 13. */ |
509 | .p2align 4 |
510 | L(strnlen_exit_tail13): |
511 | sub $14, %edi |
512 | jb L(return_start_len) |
513 | lea 13(%eax), %eax |
514 | RETURN |
515 | /* Index 14. Index 15 is handled inline in L(strnlen_exit_high_8). */ |
516 | .p2align 4 |
517 | L(strnlen_exit_tail14): |
518 | sub $15, %edi |
519 | jb L(return_start_len) |
520 | lea 14(%eax), %eax |
521 | RETURN |
522 | /* No zero byte within the limit: reload the limit argument from the stack and return it. */ |
523 | .p2align 4 |
524 | L(return_start_len): |
525 | movl LEN(%esp), %eax |
526 | RETURN |
527 | |
528 | /* Bounded byte checks for limits of 16 or less. These labels are reached only from the prolog. */ |
529 | /* Each handler restores edi to the count of bytes left in its group of four, then alternates between checking a byte and checking whether the limit has been reached. */ |
530 | .p2align 4 |
531 | L(len_less4_prolog): |
532 | xor %eax, %eax /* the prolog branches here before clearing eax */ |
533 | |
534 | add $4, %edi /* edi = limit again, which is 0 to 4 here */ |
535 | jz L(exit_tail0) /* limit 0: return 0 without reading the string */ |
536 | |
537 | cmpb $0, (%edx) |
538 | jz L(exit_tail0) |
539 | cmp $1, %edi |
540 | je L(exit_tail1) /* byte 0 is nonzero and the limit is 1: return 1 */ |
541 | |
542 | cmpb $0, 1(%edx) |
543 | jz L(exit_tail1) |
544 | cmp $2, %edi |
545 | je L(exit_tail2) |
546 | |
547 | cmpb $0, 2(%edx) |
548 | jz L(exit_tail2) |
549 | cmp $3, %edi |
550 | je L(exit_tail3) |
551 | |
552 | cmpb $0, 3(%edx) |
553 | jz L(exit_tail3) |
554 | mov $4, %eax /* four nonzero bytes and a limit of 4 */ |
555 | RETURN |
556 | /* Limit is 5 to 8 and bytes 0 to 3 are nonzero. eax is still 0, so the exit_tailN stubs yield N. */ |
557 | .p2align 4 |
558 | L(len_less8_prolog): |
559 | add $4, %edi /* edi = limit minus 4, which is 1 to 4 here */ |
560 | |
561 | cmpb $0, 4(%edx) |
562 | jz L(exit_tail4) |
563 | cmp $1, %edi |
564 | je L(exit_tail5) |
565 | |
566 | cmpb $0, 5(%edx) |
567 | jz L(exit_tail5) |
568 | cmp $2, %edi |
569 | je L(exit_tail6) |
570 | |
571 | cmpb $0, 6(%edx) |
572 | jz L(exit_tail6) |
573 | cmp $3, %edi |
574 | je L(exit_tail7) |
575 | |
576 | cmpb $0, 7(%edx) |
577 | jz L(exit_tail7) |
578 | mov $8, %eax |
579 | RETURN |
580 | |
581 | /* Limit is 9 to 12 and bytes 0 to 7 are nonzero. */ |
582 | .p2align 4 |
583 | L(len_less12_prolog): |
584 | add $4, %edi /* edi = limit minus 8, which is 1 to 4 here */ |
585 | |
586 | cmpb $0, 8(%edx) |
587 | jz L(exit_tail8) |
588 | cmp $1, %edi |
589 | je L(exit_tail9) |
590 | |
591 | cmpb $0, 9(%edx) |
592 | jz L(exit_tail9) |
593 | cmp $2, %edi |
594 | je L(exit_tail10) |
595 | |
596 | cmpb $0, 10(%edx) |
597 | jz L(exit_tail10) |
598 | cmp $3, %edi |
599 | je L(exit_tail11) |
600 | |
601 | cmpb $0, 11(%edx) |
602 | jz L(exit_tail11) |
603 | mov $12, %eax |
604 | RETURN |
605 | /* Limit is 13 to 16 and bytes 0 to 11 are nonzero. */ |
606 | .p2align 4 |
607 | L(len_less16_prolog): |
608 | add $4, %edi /* edi = limit minus 12, which is 1 to 4 here */ |
609 | |
610 | cmpb $0, 12(%edx) |
611 | jz L(exit_tail12) |
612 | cmp $1, %edi |
613 | je L(exit_tail13) |
614 | |
615 | cmpb $0, 13(%edx) |
616 | jz L(exit_tail13) |
617 | cmp $2, %edi |
618 | je L(exit_tail14) |
619 | |
620 | cmpb $0, 14(%edx) |
621 | jz L(exit_tail14) |
622 | cmp $3, %edi |
623 | je L(exit_tail15) |
624 | |
625 | cmpb $0, 15(%edx) |
626 | jz L(exit_tail15) |
627 | mov $16, %eax |
628 | RETURN |
629 | # endif |
630 | |
631 | .p2align 4 |
632 | L(exit_tail1): |
633 | add $1, %eax /* each exit_tailN stub adds N to eax, which holds 0 from the prolog or a block offset from L(exit), and returns */ |
634 | RETURN |
635 | |
636 | L(exit_tail2): |
637 | add $2, %eax |
638 | RETURN |
639 | |
640 | L(exit_tail3): |
641 | add $3, %eax |
642 | RETURN |
643 | |
644 | L(exit_tail4): |
645 | add $4, %eax |
646 | RETURN |
647 | |
648 | L(exit_tail5): |
649 | add $5, %eax |
650 | RETURN |
651 | |
652 | L(exit_tail6): |
653 | add $6, %eax |
654 | RETURN |
655 | |
656 | L(exit_tail7): |
657 | add $7, %eax |
658 | RETURN |
659 | |
660 | L(exit_tail8): |
661 | add $8, %eax |
662 | RETURN |
663 | |
664 | L(exit_tail9): |
665 | add $9, %eax |
666 | RETURN |
667 | |
668 | L(exit_tail10): |
669 | add $10, %eax |
670 | RETURN |
671 | |
672 | L(exit_tail11): |
673 | add $11, %eax |
674 | RETURN |
675 | |
676 | L(exit_tail12): |
677 | add $12, %eax |
678 | RETURN |
679 | |
680 | L(exit_tail13): |
681 | add $13, %eax |
682 | RETURN |
683 | |
684 | L(exit_tail14): |
685 | add $14, %eax |
686 | RETURN |
687 | /* With USE_AS_STRCAT defined, no RETURN or END is emitted after the last stub, so control continues into whatever the including file places next. */ |
688 | L(exit_tail15): |
689 | add $15, %eax |
690 | # ifndef USE_AS_STRCAT |
691 | RETURN |
692 | END (STRLEN) |
693 | # endif |
694 | #endif |
695 | |