/* strlen with SSE2
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* For strlen, only the SHARED version is optimized; for strcat, strncat
   and strnlen, both the STATIC and SHARED versions are optimized.  */

#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)

# ifndef USE_AS_STRCAT

#  include <sysdep.h>

/* Stack layout at function entry: the return address is at 0(%esp), so
   the first argument (the string pointer) is PARMS bytes above %esp.
   STR(%esp) is read before anything is pushed.  */
#  define PARMS	4
#  define STR	PARMS
#  define RETURN	ret

#  ifdef USE_AS_STRNLEN
/* LEN is used only after PUSH (%edi), which moves every argument 4 bytes
   further from %esp: LEN(%esp) expands to "4 + 8(%esp)", i.e. 12(%esp),
   the second argument (maximum length).  */
#   define LEN	PARMS + 8

/* Unwind annotations that accompany a 4-byte push/pop of REG.  */
#   define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#   define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#   define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#   define POP(REG)	popl REG; CFI_POP (REG)

/* strnlen keeps %edi on the stack for the whole function, so every return
   must pop it first.  CFI_PUSH is placed AFTER the ret: it re-establishes
   the "%edi is pushed" unwind state for the code that textually follows
   each RETURN, while the ret instruction itself stays covered by the
   state produced by POP (CFA and return-address slot match the real
   stack).  Putting CFI_PUSH before the ret would describe the ret with a
   CFA that is 4 bytes off.  Only unwind tables are affected; the emitted
   instructions are popl + ret either way.  */
#   undef RETURN
#   define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
#  endif

/* Default symbol name; a file that includes this one may define STRLEN
   beforehand to choose another.  */
#  ifndef STRLEN
#   define STRLEN	__strlen_sse2
#  endif

/* STRLEN -- named __strlen_sse2 unless STRLEN is already defined.

   Three build flavours share this body:
     - plain strlen: the string pointer is loaded from STR(%esp);
     - USE_AS_STRNLEN: additionally %edi is pushed and loaded with the
       maximum length from LEN(%esp); from then on it tracks how many
       bytes may still be examined;
     - USE_AS_STRCAT: the section directive, ENTRY, the argument load and
       the closing RETURN/END are compiled out, so the code starts at the
       "xor %eax, %eax" below and reads the string pointer from %edx
       (presumably set up by the including file, which must then also
       provide RETURN -- confirm against the includer).

   Result: %eax = number of bytes before the first NUL; for strnlen the
   maximum length is returned when no NUL lies within it.

   Registers written: %eax, %ecx, %edx, %xmm0-%xmm3, %xmm6 and EFLAGS.
   %edi is used only for strnlen, where it is pushed on entry and popped
   by RETURN.

   Outline:
     1. bytes 0..15 are tested one at a time;
     2. sixteen aligned 16-byte blocks are tested with pcmpeqb/pmovmskb;
     3. the pointer is rounded down to 64 bytes and a loop tests 64 bytes
	per iteration, merging four blocks with pminub;
     4. L(exit) and friends turn the 16-bit byte mask into a byte index.  */

	atom_text_section
ENTRY (STRLEN)
	mov	STR(%esp), %edx
#  ifdef USE_AS_STRNLEN
	PUSH	(%edi)
	movl	LEN(%esp), %edi
	/* Limit of at most 4 bytes (including 0): finish in the prolog
	   helper, which re-checks the limit before every byte.  */
	sub	$4, %edi
	jbe	L(len_less4_prolog)
#  endif
# endif
	/* %eax = 0 so that each L(exit_tailN) ("add $N, %eax") returns
	   exactly N.  %edx = start of the string.  */
	xor	%eax, %eax
	cmpb	$0, (%edx)
	jz	L(exit_tail0)
	cmpb	$0, 1(%edx)
	jz	L(exit_tail1)
	cmpb	$0, 2(%edx)
	jz	L(exit_tail2)
	cmpb	$0, 3(%edx)
	jz	L(exit_tail3)

# ifdef USE_AS_STRNLEN
	/* Same budget check before each further group of four bytes.  */
	sub	$4, %edi
	jbe	L(len_less8_prolog)
# endif

	cmpb	$0, 4(%edx)
	jz	L(exit_tail4)
	cmpb	$0, 5(%edx)
	jz	L(exit_tail5)
	cmpb	$0, 6(%edx)
	jz	L(exit_tail6)
	cmpb	$0, 7(%edx)
	jz	L(exit_tail7)

# ifdef USE_AS_STRNLEN
	sub	$4, %edi
	jbe	L(len_less12_prolog)
# endif

	cmpb	$0, 8(%edx)
	jz	L(exit_tail8)
	cmpb	$0, 9(%edx)
	jz	L(exit_tail9)
	cmpb	$0, 10(%edx)
	jz	L(exit_tail10)
	cmpb	$0, 11(%edx)
	jz	L(exit_tail11)

# ifdef USE_AS_STRNLEN
	sub	$4, %edi
	jbe	L(len_less16_prolog)
# endif

	cmpb	$0, 12(%edx)
	jz	L(exit_tail12)
	cmpb	$0, 13(%edx)
	jz	L(exit_tail13)
	cmpb	$0, 14(%edx)
	jz	L(exit_tail14)
	cmpb	$0, 15(%edx)
	jz	L(exit_tail15)

	/* No NUL in bytes 0..15.  Switch to aligned 16-byte blocks:
	     %ecx = start + 16, the bias L(exit) subtracts from %eax;
	     %eax = %ecx rounded down to a 16-byte boundary, which lies in
		    (start, start + 16], so no byte is skipped.  */
	pxor	%xmm0, %xmm0
	lea	16(%edx), %eax
	mov	%eax, %ecx
	and	$-16, %eax

# ifdef USE_AS_STRNLEN
	/* %edi is limit - 16 here.  Adding start & 15 turns it into the
	   number of bytes allowed from the aligned address in %eax.  With
	   64 or fewer left, finish in L(len_less64).  */
	and	$15, %edx
	add	%edx, %edi
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	/* Blocks 0-3.  pcmpeqb against an all-zero register sets 0xff in
	   every byte position that holds NUL; pmovmskb gathers one mask bit
	   per byte into %edx.  %eax is advanced before the branch, so on a
	   hit L(exit) sees it 16 bytes past the matching block.  lea is used
	   for the advance because it leaves the flags from test intact.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	/* Blocks 4-7.  %xmm0-%xmm3 need no re-zeroing: each last received a
	   compare result with no match, i.e. all zero bytes.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	/* Blocks 8-11.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	/* Blocks 12-15.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

# ifdef USE_AS_STRNLEN
	/* %eax is about to be rounded down to a 64-byte boundary; the
	   %eax & 63 bytes it steps back over are handed back to the budget
	   so %edi still counts from the address in %eax.  */
	mov	%eax, %edx
	and	$63, %edx
	add	%edx, %edi
# endif

	/* Bytes between the rounded-down address and the old %eax have
	   already been tested, so re-reading them cannot produce a hit.  */
	and	$-0x40, %eax

	/* Main loop: 64 bytes per iteration from a 64-byte aligned %eax
	   (hence movaps).  The unsigned byte-wise minimum of the four
	   blocks has a zero byte exactly where at least one block has one.
	   %xmm3 is all-zero here (its last compare found no match) and is
	   not written inside the loop.  */
	.p2align 4
L(aligned_64_loop):
# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif
	movaps	(%eax), %xmm0
	movaps	16(%eax), %xmm1
	movaps	32(%eax), %xmm2
	movaps	48(%eax), %xmm6
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqb	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	64(%eax), %eax
	jz	L(aligned_64_loop)

	/* A NUL is somewhere in the 64 bytes ending at %eax.  Find the block.
	   %xmm0 and %xmm2 were overwritten by pminub, so blocks 0 and 2 are
	   compared from memory; %xmm1 and %xmm6 still hold blocks 1 and 3.
	   After every miss %xmm3 is all-zero again.  %ecx is adjusted so
	   that "%eax - %ecx" in L(exit) is the offset of the block under
	   test from the start of the string: +48 for block 0, then 16 less
	   for each following block.  */
	pcmpeqb	-64(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	48(%ecx), %ecx
	jnz	L(exit)

	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	pcmpeqb	-32(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	/* Only block 3 is left, so its mask is used without testing it.  */
	pcmpeqb	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx

	/* %eax - %ecx = offset of the matching block from the string start;
	   %edx = its non-zero 16-bit NUL mask.  Add the index of the lowest
	   set bit.  %dl covers bytes 0-7, %dh bytes 8-15; each half is
	   narrowed to a nibble, then bits are tested in ascending order.
	   The fourth bit of a nibble needs no test: it is the only
	   candidate left.  %ecx is free to be clobbered after the sub.  */
L(exit):
	sub	%ecx, %eax
	test	%dl, %dl
	jz	L(exit_high)

	mov	%dl, %cl
	and	$15, %cl
	jz	L(exit_8)
	test	$0x01, %dl
	jnz	L(exit_tail0)
	test	$0x02, %dl
	jnz	L(exit_tail1)
	test	$0x04, %dl
	jnz	L(exit_tail2)
	add	$3, %eax
	RETURN

	/* NUL is at byte 4, 5, 6 or 7 of the block.  */
	.p2align 4
L(exit_8):
	test	$0x10, %dl
	jnz	L(exit_tail4)
	test	$0x20, %dl
	jnz	L(exit_tail5)
	test	$0x40, %dl
	jnz	L(exit_tail6)
	add	$7, %eax
	RETURN

	/* NUL is in bytes 8-15 of the block; decode %dh the same way.  */
	.p2align 4
L(exit_high):
	mov	%dh, %ch
	and	$15, %ch
	jz	L(exit_high_8)
	test	$0x01, %dh
	jnz	L(exit_tail8)
	test	$0x02, %dh
	jnz	L(exit_tail9)
	test	$0x04, %dh
	jnz	L(exit_tail10)
	add	$11, %eax
	RETURN

	/* NUL is at byte 12, 13, 14 or 15 of the block.  */
	.p2align 4
L(exit_high_8):
	test	$0x10, %dh
	jnz	L(exit_tail12)
	test	$0x20, %dh
	jnz	L(exit_tail13)
	test	$0x40, %dh
	jnz	L(exit_tail14)
	add	$15, %eax
	/* Falls through: L(exit_tail0) adds nothing and just returns.  */
L(exit_tail0):
	RETURN

# ifdef USE_AS_STRNLEN

	/* strnlen tail: at most 64 bytes of budget remain from %eax.  Undo
	   the "sub $64" that led here, then test up to four blocks,
	   charging 16 bytes to the budget after each miss.  %xmm0/%xmm1
	   alternate; each is all-zero again after a miss.  */
	.p2align 4
L(len_less64):
	pxor	%xmm0, %xmm0
	add	$64, %edi

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	sub	$16, %edi
	jbe	L(return_start_len)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	sub	$16, %edi
	jbe	L(return_start_len)

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	sub	$16, %edi
	jbe	L(return_start_len)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	/* No NUL in any of the four blocks: the result is the limit.  */
	movl	LEN(%esp), %eax
	RETURN

	/* Same mask decoding as L(exit), but a NUL at index k of the block
	   only counts when at least k + 1 bytes of budget remain for this
	   block ("sub $(k + 1), %edi; jb"); otherwise the limit is returned.
	   Index 0 needs no check and shares L(exit_tail0).  */
	.p2align 4
L(strnlen_exit):
	sub	%ecx, %eax

	test	%dl, %dl
	jz	L(strnlen_exit_high)
	mov	%dl, %cl
	and	$15, %cl
	jz	L(strnlen_exit_8)
	test	$0x01, %dl
	jnz	L(exit_tail0)
	test	$0x02, %dl
	jnz	L(strnlen_exit_tail1)
	test	$0x04, %dl
	jnz	L(strnlen_exit_tail2)
	sub	$4, %edi
	jb	L(return_start_len)
	lea	3(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_8):
	test	$0x10, %dl
	jnz	L(strnlen_exit_tail4)
	test	$0x20, %dl
	jnz	L(strnlen_exit_tail5)
	test	$0x40, %dl
	jnz	L(strnlen_exit_tail6)
	sub	$8, %edi
	jb	L(return_start_len)
	lea	7(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_high):
	mov	%dh, %ch
	and	$15, %ch
	jz	L(strnlen_exit_high_8)
	test	$0x01, %dh
	jnz	L(strnlen_exit_tail8)
	test	$0x02, %dh
	jnz	L(strnlen_exit_tail9)
	test	$0x04, %dh
	jnz	L(strnlen_exit_tail10)
	sub	$12, %edi
	jb	L(return_start_len)
	lea	11(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_high_8):
	test	$0x10, %dh
	jnz	L(strnlen_exit_tail12)
	test	$0x20, %dh
	jnz	L(strnlen_exit_tail13)
	test	$0x40, %dh
	jnz	L(strnlen_exit_tail14)
	sub	$16, %edi
	jb	L(return_start_len)
	lea	15(%eax), %eax
	RETURN

	/* L(strnlen_exit_tailK): NUL at index K of the block.  Return
	   %eax + K if the budget covers it, the limit otherwise.  */
	.p2align 4
L(strnlen_exit_tail1):
	sub	$2, %edi
	jb	L(return_start_len)
	lea	1(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail2):
	sub	$3, %edi
	jb	L(return_start_len)
	lea	2(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail4):
	sub	$5, %edi
	jb	L(return_start_len)
	lea	4(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail5):
	sub	$6, %edi
	jb	L(return_start_len)
	lea	5(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail6):
	sub	$7, %edi
	jb	L(return_start_len)
	lea	6(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail8):
	sub	$9, %edi
	jb	L(return_start_len)
	lea	8(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail9):
	sub	$10, %edi
	jb	L(return_start_len)
	lea	9(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail10):
	sub	$11, %edi
	jb	L(return_start_len)
	lea	10(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail12):
	sub	$13, %edi
	jb	L(return_start_len)
	lea	12(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail13):
	sub	$14, %edi
	jb	L(return_start_len)
	lea	13(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail14):
	sub	$15, %edi
	jb	L(return_start_len)
	lea	14(%eax), %eax
	RETURN

	/* The limit was reached first: return the limit itself, re-read from
	   the stack because %edi no longer holds it.  */
	.p2align 4
L(return_start_len):
	movl	LEN(%esp), %eax
	RETURN

/* for prolog only */

	/* Reached from the entry check with limit <= 4.  "add $4" restores
	   %edi to the limit itself; %edx is still the string start.  A zero
	   limit returns 0 without reading memory.  Otherwise each byte is
	   tested, and after every non-NUL byte the limit is compared with
	   the count of bytes examined so far.  */
	.p2align 4
L(len_less4_prolog):
	xor	%eax, %eax

	add	$4, %edi
	jz	L(exit_tail0)

	cmpb	$0, (%edx)
	jz	L(exit_tail0)
	cmp	$1, %edi
	je	L(exit_tail1)

	cmpb	$0, 1(%edx)
	jz	L(exit_tail1)
	cmp	$2, %edi
	je	L(exit_tail2)

	cmpb	$0, 2(%edx)
	jz	L(exit_tail2)
	cmp	$3, %edi
	je	L(exit_tail3)

	cmpb	$0, 3(%edx)
	jz	L(exit_tail3)
	mov	$4, %eax
	RETURN

	/* The next three helpers follow the same pattern for bytes 4-7, 8-11
	   and 12-15.  On entry %eax is 0 and, after "add $4", %edi is the
	   number of bytes the limit still allows beyond the group already
	   tested.  */
	.p2align 4
L(len_less8_prolog):
	add	$4, %edi

	cmpb	$0, 4(%edx)
	jz	L(exit_tail4)
	cmp	$1, %edi
	je	L(exit_tail5)

	cmpb	$0, 5(%edx)
	jz	L(exit_tail5)
	cmp	$2, %edi
	je	L(exit_tail6)

	cmpb	$0, 6(%edx)
	jz	L(exit_tail6)
	cmp	$3, %edi
	je	L(exit_tail7)

	cmpb	$0, 7(%edx)
	jz	L(exit_tail7)
	mov	$8, %eax
	RETURN


	.p2align 4
L(len_less12_prolog):
	add	$4, %edi

	cmpb	$0, 8(%edx)
	jz	L(exit_tail8)
	cmp	$1, %edi
	je	L(exit_tail9)

	cmpb	$0, 9(%edx)
	jz	L(exit_tail9)
	cmp	$2, %edi
	je	L(exit_tail10)

	cmpb	$0, 10(%edx)
	jz	L(exit_tail10)
	cmp	$3, %edi
	je	L(exit_tail11)

	cmpb	$0, 11(%edx)
	jz	L(exit_tail11)
	mov	$12, %eax
	RETURN

	.p2align 4
L(len_less16_prolog):
	add	$4, %edi

	cmpb	$0, 12(%edx)
	jz	L(exit_tail12)
	cmp	$1, %edi
	je	L(exit_tail13)

	cmpb	$0, 13(%edx)
	jz	L(exit_tail13)
	cmp	$2, %edi
	je	L(exit_tail14)

	cmpb	$0, 14(%edx)
	jz	L(exit_tail14)
	cmp	$3, %edi
	je	L(exit_tail15)

	cmpb	$0, 15(%edx)
	jz	L(exit_tail15)
	mov	$16, %eax
	RETURN
# endif

	/* L(exit_tailN): add N to %eax and return.  %eax is either 0 (from
	   the bytewise prologue) or the offset of the matching 16-byte
	   block (from L(exit)).  */
	.p2align 4
L(exit_tail1):
	add	$1, %eax
	RETURN

L(exit_tail2):
	add	$2, %eax
	RETURN

L(exit_tail3):
	add	$3, %eax
	RETURN

L(exit_tail4):
	add	$4, %eax
	RETURN

L(exit_tail5):
	add	$5, %eax
	RETURN

L(exit_tail6):
	add	$6, %eax
	RETURN

L(exit_tail7):
	add	$7, %eax
	RETURN

L(exit_tail8):
	add	$8, %eax
	RETURN

L(exit_tail9):
	add	$9, %eax
	RETURN

L(exit_tail10):
	add	$10, %eax
	RETURN

L(exit_tail11):
	add	$11, %eax
	RETURN

L(exit_tail12):
	add	$12, %eax
	RETURN

L(exit_tail13):
	add	$13, %eax
	RETURN

L(exit_tail14):
	add	$14, %eax
	RETURN

L(exit_tail15):
	add	$15, %eax
	/* With USE_AS_STRCAT no RETURN/END is emitted here: execution
	   continues into whatever follows this code in the including file.  */
# ifndef USE_AS_STRCAT
	RETURN
END (STRLEN)
# endif
#endif

/* Source: glibc/sysdeps/i386/i686/multiarch/strlen-sse2.S  */