1/* memcmp with SSSE3, wmemcmp with SSSE3
2 Copyright (C) 2010-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
23# ifndef MEMCMP
24# define MEMCMP __memcmp_ssse3
25# endif
26
/* Unwind bookkeeping that accompanies a 4-byte push: grow the CFA offset
   by 4 and record REG as saved at offset 0.  The cfi_* helpers used in
   these macros are not defined in this file (presumably they come from
   <sysdep.h>).  */
27# define CFI_PUSH(REG) \
28 cfi_adjust_cfa_offset (4); \
29 cfi_rel_offset (REG, 0)
30
/* Unwind bookkeeping that accompanies a 4-byte pop: shrink the CFA offset
   by 4 and mark REG as restored.  */
31# define CFI_POP(REG) \
32 cfi_adjust_cfa_offset (-4); \
33 cfi_restore (REG)
34
/* pushl/popl paired with the matching unwind annotation.  */
35# define PUSH(REG) pushl REG; CFI_PUSH (REG)
36# define POP(REG) popl REG; CFI_POP (REG)
37
/* Offsets from %esp, as used at function entry, of the three arguments:
   BLK1 = 4, BLK2 = 8, LEN = 12.  */
38# define PARMS 4
39# define BLK1 PARMS
40# define BLK2 BLK1+4
41# define LEN BLK2+4
/* Epilogue for paths that have %ebx, %esi and %edi pushed: pop them in
   reverse order and return.  */
42# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
/* Same epilogue, then re-establish the remembered unwind state (and
   remember it again) for whatever code follows the ret in the file.  */
43# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
44
45/* Warning!
46 wmemcmp has to use SIGNED comparison for elements.
47 memcmp has to use UNSIGNED comparison for elements.
48*/
49
/* MEMCMP entry.  Loads the length into %ecx and the two block pointers
   into %eax (BLK1) and %edx (BLK2), then routes on the length:
     - %ecx >= 48           -> L(48bytesormore)
     - memcmp, %ecx <= 1    -> L(less1bytes)
     - everything else      -> L(less48bytes) (defined outside this
       excerpt), entered with %ebx pushed and both pointers advanced by
       %ecx, i.e. pointing at the end of their blocks.
   atom_text_section is a macro defined outside this file.  */
50 atom_text_section
51ENTRY (MEMCMP)
52 movl LEN(%esp), %ecx
53
54# ifdef USE_AS_WMEMCMP
/* wmemcmp build: multiply the length by 4 (shl $2); a zero length
   compares equal.  */
55 shl $2, %ecx
56 test %ecx, %ecx
57 jz L(zero)
58# endif
59
60 movl BLK1(%esp), %eax
61 cmp $48, %ecx
/* movl does not modify flags, so the jae below still tests the cmp.  */
62 movl BLK2(%esp), %edx
63 jae L(48bytesormore)
64
65# ifndef USE_AS_WMEMCMP
/* The flags of this cmp are consumed again by the jb at L(less1bytes).  */
66 cmp $1, %ecx
67 jbe L(less1bytes)
68# endif
69
70 PUSH (%ebx)
71 add %ecx, %edx
72 add %ecx, %eax
73 jmp L(less48bytes)
74
/* Unwind annotation only (no instruction is emitted): the code that
   follows is reached without %ebx pushed.  */
75 CFI_POP (%ebx)
76
77# ifndef USE_AS_WMEMCMP
/* memcmp with a length of 0 or 1.  Entered from the `cmp $1, %ecx; jbe`
   at function entry with the flags of that cmp still live; nothing has
   been pushed on this path, so it returns with a plain ret.
   Returns 0, 1 or -1 in %eax.  */
78 .p2align 4
79L(less1bytes):
/* Below 1 means the length is 0: equal.  */
80 jb L(zero)
81 movb (%eax), %cl
82 cmp (%edx), %cl
83 je L(zero)
/* mov leaves the flags alone, so ja still tests the byte compare:
   unsigned "BLK1 byte above BLK2 byte" returns 1, otherwise -1.  */
84 mov $1, %eax
85 ja L(1bytesend)
86 neg %eax
87L(1bytesend):
88 ret
89# endif
90
/* L(zero): return 0 ("blocks are equal").  Uses a bare ret, so it is only
   valid on paths that have not pushed any registers.  */
91 .p2align 4
92L(zero):
93 xor %eax, %eax
94 ret
95
/* L(48bytesormore): length >= 48.  Saves %ebx/%esi/%edi, compares the
   first 16 bytes with unaligned loads, then rounds %edi (BLK1) down to a
   16-byte boundary and works out how far %esi (BLK2) is from one.
   On fall-through into the dispatch table below:
     %edi = 16-byte aligned BLK1 cursor
     %esi = BLK2 cursor rounded down to a 16-byte boundary
     %edx = BLK2 offset within its 16-byte line (1..15; 0 goes to L(shr_0))
     %ecx = length adjusted by the amount %edi was rounded down.  */
96 .p2align 4
97L(48bytesormore):
98 PUSH (%ebx)
99 PUSH (%esi)
100 PUSH (%edi)
/* This three-registers-pushed unwind state is the one the later
   cfi_restore_state directives return to.  */
101 cfi_remember_state
102 movdqu (%eax), %xmm3
103 movdqu (%edx), %xmm0
104 movl %eax, %edi
105 movl %edx, %esi
106 pcmpeqb %xmm0, %xmm3
107 pmovmskb %xmm3, %edx
108 lea 16(%edi), %edi
109
/* The mask is 0xffff when all 16 bytes are equal, so a non-zero result
   means a difference; lea does not disturb the flags for the jnz.  */
110 sub $0xffff, %edx
111 lea 16(%esi), %esi
112 jnz L(less16bytes)
/* Round %edi down to 16 bytes and move %esi/%ecx by the same amount so
   both cursors still refer to corresponding bytes.  */
113 mov %edi, %edx
114 and $0xf, %edx
115 xor %edx, %edi
116 sub %edx, %esi
117 add %edx, %ecx
/* %edx = low four bits of %esi; zero means both cursors are aligned.  */
118 mov %esi, %edx
119 and $0xf, %edx
120 jz L(shr_0)
/* Clear those low bits so %esi is 16-byte aligned as well.  */
121 xor %edx, %esi
122
/* Dispatch on %edx (the BLK2 offset within its 16-byte line, computed
   just above) to the matching L(shr_N) handler via a compare chain.
   memcmp handles every offset 0..15, split into two halves; the wmemcmp
   build only tests 0, 4, 8 and falls through to 12 (presumably because
   wide-character operands are expected to be 4-byte aligned -- confirm).
   NOTE(review): %edx cannot be 0 here because the `jz L(shr_0)` right
   before this table already took that case, so the `cmp $0` tests look
   redundant.  */
123# ifndef USE_AS_WMEMCMP
124 cmp $8, %edx
125 jae L(next_unaligned_table)
126 cmp $0, %edx
127 je L(shr_0)
128 cmp $1, %edx
129 je L(shr_1)
130 cmp $2, %edx
131 je L(shr_2)
132 cmp $3, %edx
133 je L(shr_3)
134 cmp $4, %edx
135 je L(shr_4)
136 cmp $5, %edx
137 je L(shr_5)
138 cmp $6, %edx
139 je L(shr_6)
/* Only 7 is left in the lower half.  */
140 jmp L(shr_7)
141
142 .p2align 2
143L(next_unaligned_table):
144 cmp $8, %edx
145 je L(shr_8)
146 cmp $9, %edx
147 je L(shr_9)
148 cmp $10, %edx
149 je L(shr_10)
150 cmp $11, %edx
151 je L(shr_11)
152 cmp $12, %edx
153 je L(shr_12)
154 cmp $13, %edx
155 je L(shr_13)
156 cmp $14, %edx
157 je L(shr_14)
/* Only 15 is left in the upper half.  */
158 jmp L(shr_15)
159# else
160 cmp $0, %edx
161 je L(shr_0)
162 cmp $4, %edx
163 je L(shr_4)
164 cmp $8, %edx
165 je L(shr_8)
166 jmp L(shr_12)
167# endif
168
/* L(shr_0): both cursors (%edi, %esi) are 16-byte aligned, so no byte
   shifting is needed.  With %ecx >= 80 the work goes to
   L(shr_0_gobble); otherwise a single 32-byte step is compared here and
   the rest is left to L(less48bytes) (defined outside this excerpt).  */
169 .p2align 4
170L(shr_0):
171 cmp $80, %ecx
172 jae L(shr_0_gobble)
173 lea -48(%ecx), %ecx
/* %eax = 0 is the shift amount that L(exit) adds back onto %esi.  */
174 xor %eax, %eax
175 movaps (%esi), %xmm1
176 pcmpeqb (%edi), %xmm1
177 movaps 16(%esi), %xmm2
178 pcmpeqb 16(%edi), %xmm2
/* Combine both 16-byte equality results; %xmm1 keeps the first one so
   L(exit) can tell which half differed.  */
179 pand %xmm1, %xmm2
180 pmovmskb %xmm2, %edx
181 add $32, %edi
182 add $32, %esi
/* Non-zero after the subtraction means at least one byte differed.  */
183 sub $0xffff, %edx
184 jnz L(exit)
185
/* All 32 bytes equal: hand %edi + %ecx and %esi + %ecx to the tail code
   in %eax/%edx and drop the saved %edi/%esi (%ebx stays pushed).  */
186 lea (%ecx, %edi,1), %eax
187 lea (%ecx, %esi,1), %edx
188 POP (%edi)
189 POP (%esi)
190 jmp L(less48bytes)
191
/* L(shr_0_gobble): 32-bytes-per-iteration loop for the case where both
   cursors are 16-byte aligned.  The equality results for the next 32
   bytes are computed while the previous pair's mask is being tested;
   %xmm1 keeps a copy of the first-half result for L(exit).
   The cfi_restore_state/cfi_remember_state pair returns the unwind
   description to the three-registers-pushed state after the POPs in the
   preceding block.  */
192 cfi_restore_state
193 cfi_remember_state
194 .p2align 4
195L(shr_0_gobble):
196 lea -48(%ecx), %ecx
197 movdqa (%esi), %xmm0
/* %eax = 0 is the shift amount that L(exit) adds back onto %esi.  */
198 xor %eax, %eax
199 pcmpeqb (%edi), %xmm0
200 sub $32, %ecx
201 movdqa 16(%esi), %xmm2
202 pcmpeqb 16(%edi), %xmm2
203L(shr_0_gobble_loop):
204 pand %xmm0, %xmm2
/* The borrow (CF) of this sub is consumed by the sbb below; the SSE
   moves in between do not touch the flags.  */
205 sub $32, %ecx
206 pmovmskb %xmm2, %edx
207 movdqa %xmm0, %xmm1
208 movdqa 32(%esi), %xmm0
209 movdqa 48(%esi), %xmm2
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow, which is the condition for looping again.  */
210 sbb $0xffff, %edx
211 pcmpeqb 32(%edi), %xmm0
212 pcmpeqb 48(%edi), %xmm2
213 lea 32(%edi), %edi
214 lea 32(%esi), %esi
215 jz L(shr_0_gobble_loop)
216
217 pand %xmm0, %xmm2
/* A negative %ecx here goes with a borrow in the last sub, in which case
   the sbb took one too many: undo it and give the 32 back to %ecx.  */
218 cmp $0, %ecx
219 jge L(shr_0_gobble_loop_next)
220 inc %edx
221 add $32, %ecx
222L(shr_0_gobble_loop_next):
223 test %edx, %edx
224 jnz L(exit)
225
/* Check the last 32 bytes whose results are already in %xmm0/%xmm2.  */
226 pmovmskb %xmm2, %edx
227 movdqa %xmm0, %xmm1
228 lea 32(%edi), %edi
229 lea 32(%esi), %esi
230 sub $0xffff, %edx
231 jnz L(exit)
/* Everything matched so far: pass %edi + %ecx and %esi + %ecx to the
   tail code (L(less48bytes), outside this excerpt).  */
232 lea (%ecx, %edi,1), %eax
233 lea (%ecx, %esi,1), %edx
234 POP (%edi)
235 POP (%esi)
236 jmp L(less48bytes)
237
238# ifndef USE_AS_WMEMCMP
/* L(shr_1): BLK2 data starts 1 byte past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_1_gobble); otherwise one 32-byte step is compared here, using
   palignr $1 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
239 cfi_restore_state
240 cfi_remember_state
241 .p2align 4
242L(shr_1):
243 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 1 on entry) so L(exit) can add it back onto %esi.  */
244 lea -48(%ecx), %ecx
245 mov %edx, %eax
246 jae L(shr_1_gobble)
247
248 movdqa 16(%esi), %xmm1
249 movdqa %xmm1, %xmm2
250 palignr $1,(%esi), %xmm1
251 pcmpeqb (%edi), %xmm1
252
253 movdqa 32(%esi), %xmm3
254 palignr $1,%xmm2, %xmm3
255 pcmpeqb 16(%edi), %xmm3
256
257 pand %xmm1, %xmm3
258 pmovmskb %xmm3, %edx
259 lea 32(%edi), %edi
260 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
261 sub $0xffff, %edx
262 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 1 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
263 lea (%ecx, %edi,1), %eax
264 lea 1(%ecx, %esi,1), %edx
265 POP (%edi)
266 POP (%esi)
267 jmp L(less48bytes)
268
/* L(shr_1_gobble): 32-bytes-per-iteration loop for the 1-byte shift case.
   Each half is rebuilt with palignr $1 from two aligned loads at %esi and
   compared with the aligned data at %edi; results for the next 32 bytes
   are computed while the previous mask is tested.  %xmm1 keeps the
   first-half result for L(exit); %eax still holds the shift.  */
269 cfi_restore_state
270 cfi_remember_state
271 .p2align 4
272L(shr_1_gobble):
273 sub $32, %ecx
274 movdqa 16(%esi), %xmm0
275 palignr $1,(%esi), %xmm0
276 pcmpeqb (%edi), %xmm0
277
278 movdqa 32(%esi), %xmm3
279 palignr $1,16(%esi), %xmm3
280 pcmpeqb 16(%edi), %xmm3
281
282L(shr_1_gobble_loop):
283 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
284 sub $32, %ecx
285 pmovmskb %xmm3, %edx
286 movdqa %xmm0, %xmm1
287
288 movdqa 64(%esi), %xmm3
289 palignr $1,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
290 sbb $0xffff, %edx
291 movdqa 48(%esi), %xmm0
292 palignr $1,32(%esi), %xmm0
293 pcmpeqb 32(%edi), %xmm0
294 lea 32(%esi), %esi
295 pcmpeqb 48(%edi), %xmm3
296
297 lea 32(%edi), %edi
298 jz L(shr_1_gobble_loop)
299 pand %xmm0, %xmm3
300
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
301 cmp $0, %ecx
302 jge L(shr_1_gobble_next)
303 inc %edx
304 add $32, %ecx
305L(shr_1_gobble_next):
306 test %edx, %edx
307 jnz L(exit)
308
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
309 pmovmskb %xmm3, %edx
310 movdqa %xmm0, %xmm1
311 lea 32(%edi), %edi
312 lea 32(%esi), %esi
313 sub $0xffff, %edx
314 jnz L(exit)
315
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 1 to the tail
   code (L(less48bytes), outside this excerpt).  */
316 lea (%ecx, %edi,1), %eax
317 lea 1(%ecx, %esi,1), %edx
318 POP (%edi)
319 POP (%esi)
320 jmp L(less48bytes)
321
322
/* L(shr_2): BLK2 data starts 2 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_2_gobble); otherwise one 32-byte step is compared here, using
   palignr $2 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
323 cfi_restore_state
324 cfi_remember_state
325 .p2align 4
326L(shr_2):
327 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 2 on entry) so L(exit) can add it back onto %esi.  */
328 lea -48(%ecx), %ecx
329 mov %edx, %eax
330 jae L(shr_2_gobble)
331
332 movdqa 16(%esi), %xmm1
333 movdqa %xmm1, %xmm2
334 palignr $2,(%esi), %xmm1
335 pcmpeqb (%edi), %xmm1
336
337 movdqa 32(%esi), %xmm3
338 palignr $2,%xmm2, %xmm3
339 pcmpeqb 16(%edi), %xmm3
340
341 pand %xmm1, %xmm3
342 pmovmskb %xmm3, %edx
343 lea 32(%edi), %edi
344 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
345 sub $0xffff, %edx
346 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 2 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
347 lea (%ecx, %edi,1), %eax
348 lea 2(%ecx, %esi,1), %edx
349 POP (%edi)
350 POP (%esi)
351 jmp L(less48bytes)
352
/* L(shr_2_gobble): 32-bytes-per-iteration loop for the 2-byte shift case.
   Each half is rebuilt with palignr $2 from two aligned loads at %esi and
   compared with the aligned data at %edi; results for the next 32 bytes
   are computed while the previous mask is tested.  %xmm1 keeps the
   first-half result for L(exit); %eax still holds the shift.  */
353 cfi_restore_state
354 cfi_remember_state
355 .p2align 4
356L(shr_2_gobble):
357 sub $32, %ecx
358 movdqa 16(%esi), %xmm0
359 palignr $2,(%esi), %xmm0
360 pcmpeqb (%edi), %xmm0
361
362 movdqa 32(%esi), %xmm3
363 palignr $2,16(%esi), %xmm3
364 pcmpeqb 16(%edi), %xmm3
365
366L(shr_2_gobble_loop):
367 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
368 sub $32, %ecx
369 pmovmskb %xmm3, %edx
370 movdqa %xmm0, %xmm1
371
372 movdqa 64(%esi), %xmm3
373 palignr $2,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
374 sbb $0xffff, %edx
375 movdqa 48(%esi), %xmm0
376 palignr $2,32(%esi), %xmm0
377 pcmpeqb 32(%edi), %xmm0
378 lea 32(%esi), %esi
379 pcmpeqb 48(%edi), %xmm3
380
381 lea 32(%edi), %edi
382 jz L(shr_2_gobble_loop)
383 pand %xmm0, %xmm3
384
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
385 cmp $0, %ecx
386 jge L(shr_2_gobble_next)
387 inc %edx
388 add $32, %ecx
389L(shr_2_gobble_next):
390 test %edx, %edx
391 jnz L(exit)
392
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
393 pmovmskb %xmm3, %edx
394 movdqa %xmm0, %xmm1
395 lea 32(%edi), %edi
396 lea 32(%esi), %esi
397 sub $0xffff, %edx
398 jnz L(exit)
399
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 2 to the tail
   code (L(less48bytes), outside this excerpt).  */
400 lea (%ecx, %edi,1), %eax
401 lea 2(%ecx, %esi,1), %edx
402 POP (%edi)
403 POP (%esi)
404 jmp L(less48bytes)
405
/* L(shr_3): BLK2 data starts 3 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_3_gobble); otherwise one 32-byte step is compared here, using
   palignr $3 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
406 cfi_restore_state
407 cfi_remember_state
408 .p2align 4
409L(shr_3):
410 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 3 on entry) so L(exit) can add it back onto %esi.  */
411 lea -48(%ecx), %ecx
412 mov %edx, %eax
413 jae L(shr_3_gobble)
414
415 movdqa 16(%esi), %xmm1
416 movdqa %xmm1, %xmm2
417 palignr $3,(%esi), %xmm1
418 pcmpeqb (%edi), %xmm1
419
420 movdqa 32(%esi), %xmm3
421 palignr $3,%xmm2, %xmm3
422 pcmpeqb 16(%edi), %xmm3
423
424 pand %xmm1, %xmm3
425 pmovmskb %xmm3, %edx
426 lea 32(%edi), %edi
427 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
428 sub $0xffff, %edx
429 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 3 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
430 lea (%ecx, %edi,1), %eax
431 lea 3(%ecx, %esi,1), %edx
432 POP (%edi)
433 POP (%esi)
434 jmp L(less48bytes)
435
/* L(shr_3_gobble): 32-bytes-per-iteration loop for the 3-byte shift case.
   Each half is rebuilt with palignr $3 from two aligned loads at %esi and
   compared with the aligned data at %edi; results for the next 32 bytes
   are computed while the previous mask is tested.  %xmm1 keeps the
   first-half result for L(exit); %eax still holds the shift.  */
436 cfi_restore_state
437 cfi_remember_state
438 .p2align 4
439L(shr_3_gobble):
440 sub $32, %ecx
441 movdqa 16(%esi), %xmm0
442 palignr $3,(%esi), %xmm0
443 pcmpeqb (%edi), %xmm0
444
445 movdqa 32(%esi), %xmm3
446 palignr $3,16(%esi), %xmm3
447 pcmpeqb 16(%edi), %xmm3
448
449L(shr_3_gobble_loop):
450 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
451 sub $32, %ecx
452 pmovmskb %xmm3, %edx
453 movdqa %xmm0, %xmm1
454
455 movdqa 64(%esi), %xmm3
456 palignr $3,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
457 sbb $0xffff, %edx
458 movdqa 48(%esi), %xmm0
459 palignr $3,32(%esi), %xmm0
460 pcmpeqb 32(%edi), %xmm0
461 lea 32(%esi), %esi
462 pcmpeqb 48(%edi), %xmm3
463
464 lea 32(%edi), %edi
465 jz L(shr_3_gobble_loop)
466 pand %xmm0, %xmm3
467
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
468 cmp $0, %ecx
469 jge L(shr_3_gobble_next)
470 inc %edx
471 add $32, %ecx
472L(shr_3_gobble_next):
473 test %edx, %edx
474 jnz L(exit)
475
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
476 pmovmskb %xmm3, %edx
477 movdqa %xmm0, %xmm1
478 lea 32(%edi), %edi
479 lea 32(%esi), %esi
480 sub $0xffff, %edx
481 jnz L(exit)
482
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 3 to the tail
   code (L(less48bytes), outside this excerpt).  */
483 lea (%ecx, %edi,1), %eax
484 lea 3(%ecx, %esi,1), %edx
485 POP (%edi)
486 POP (%esi)
487 jmp L(less48bytes)
488# endif
489
/* L(shr_4): BLK2 data starts 4 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  Built for both memcmp and
   wmemcmp.  With %ecx >= 80 the work goes to L(shr_4_gobble); otherwise
   one 32-byte step is compared here, using palignr $4 to splice adjacent
   aligned loads from %esi so they line up with the bytes at %edi.  */
490 cfi_restore_state
491 cfi_remember_state
492 .p2align 4
493L(shr_4):
494 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 4 on entry) so L(exit) can add it back onto %esi.  */
495 lea -48(%ecx), %ecx
496 mov %edx, %eax
497 jae L(shr_4_gobble)
498
499 movdqa 16(%esi), %xmm1
500 movdqa %xmm1, %xmm2
501 palignr $4,(%esi), %xmm1
502 pcmpeqb (%edi), %xmm1
503
504 movdqa 32(%esi), %xmm3
505 palignr $4,%xmm2, %xmm3
506 pcmpeqb 16(%edi), %xmm3
507
508 pand %xmm1, %xmm3
509 pmovmskb %xmm3, %edx
510 lea 32(%edi), %edi
511 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
512 sub $0xffff, %edx
513 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 4 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
514 lea (%ecx, %edi,1), %eax
515 lea 4(%ecx, %esi,1), %edx
516 POP (%edi)
517 POP (%esi)
518 jmp L(less48bytes)
519
/* L(shr_4_gobble): 32-bytes-per-iteration loop for the 4-byte shift case.
   Each half is rebuilt with palignr $4 from two aligned loads at %esi and
   compared with the aligned data at %edi; results for the next 32 bytes
   are computed while the previous mask is tested.  %xmm1 keeps the
   first-half result for L(exit); %eax still holds the shift.  */
520 cfi_restore_state
521 cfi_remember_state
522 .p2align 4
523L(shr_4_gobble):
524 sub $32, %ecx
525 movdqa 16(%esi), %xmm0
526 palignr $4,(%esi), %xmm0
527 pcmpeqb (%edi), %xmm0
528
529 movdqa 32(%esi), %xmm3
530 palignr $4,16(%esi), %xmm3
531 pcmpeqb 16(%edi), %xmm3
532
533L(shr_4_gobble_loop):
534 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
535 sub $32, %ecx
536 pmovmskb %xmm3, %edx
537 movdqa %xmm0, %xmm1
538
539 movdqa 64(%esi), %xmm3
540 palignr $4,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
541 sbb $0xffff, %edx
542 movdqa 48(%esi), %xmm0
543 palignr $4,32(%esi), %xmm0
544 pcmpeqb 32(%edi), %xmm0
545 lea 32(%esi), %esi
546 pcmpeqb 48(%edi), %xmm3
547
548 lea 32(%edi), %edi
549 jz L(shr_4_gobble_loop)
550 pand %xmm0, %xmm3
551
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
552 cmp $0, %ecx
553 jge L(shr_4_gobble_next)
554 inc %edx
555 add $32, %ecx
556L(shr_4_gobble_next):
557 test %edx, %edx
558 jnz L(exit)
559
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
560 pmovmskb %xmm3, %edx
561 movdqa %xmm0, %xmm1
562 lea 32(%edi), %edi
563 lea 32(%esi), %esi
564 sub $0xffff, %edx
565 jnz L(exit)
566
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 4 to the tail
   code (L(less48bytes), outside this excerpt).  */
567 lea (%ecx, %edi,1), %eax
568 lea 4(%ecx, %esi,1), %edx
569 POP (%edi)
570 POP (%esi)
571 jmp L(less48bytes)
572
573# ifndef USE_AS_WMEMCMP
/* L(shr_5): BLK2 data starts 5 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_5_gobble); otherwise one 32-byte step is compared here, using
   palignr $5 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
574 cfi_restore_state
575 cfi_remember_state
576 .p2align 4
577L(shr_5):
578 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 5 on entry) so L(exit) can add it back onto %esi.  */
579 lea -48(%ecx), %ecx
580 mov %edx, %eax
581 jae L(shr_5_gobble)
582
583 movdqa 16(%esi), %xmm1
584 movdqa %xmm1, %xmm2
585 palignr $5,(%esi), %xmm1
586 pcmpeqb (%edi), %xmm1
587
588 movdqa 32(%esi), %xmm3
589 palignr $5,%xmm2, %xmm3
590 pcmpeqb 16(%edi), %xmm3
591
592 pand %xmm1, %xmm3
593 pmovmskb %xmm3, %edx
594 lea 32(%edi), %edi
595 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
596 sub $0xffff, %edx
597 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 5 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
598 lea (%ecx, %edi,1), %eax
599 lea 5(%ecx, %esi,1), %edx
600 POP (%edi)
601 POP (%esi)
602 jmp L(less48bytes)
603
/* L(shr_5_gobble): 32-bytes-per-iteration loop for the 5-byte shift case.
   Each half is rebuilt with palignr $5 from two aligned loads at %esi and
   compared with the aligned data at %edi; results for the next 32 bytes
   are computed while the previous mask is tested.  %xmm1 keeps the
   first-half result for L(exit); %eax still holds the shift.  */
604 cfi_restore_state
605 cfi_remember_state
606 .p2align 4
607L(shr_5_gobble):
608 sub $32, %ecx
609 movdqa 16(%esi), %xmm0
610 palignr $5,(%esi), %xmm0
611 pcmpeqb (%edi), %xmm0
612
613 movdqa 32(%esi), %xmm3
614 palignr $5,16(%esi), %xmm3
615 pcmpeqb 16(%edi), %xmm3
616
617L(shr_5_gobble_loop):
618 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
619 sub $32, %ecx
620 pmovmskb %xmm3, %edx
621 movdqa %xmm0, %xmm1
622
623 movdqa 64(%esi), %xmm3
624 palignr $5,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
625 sbb $0xffff, %edx
626 movdqa 48(%esi), %xmm0
627 palignr $5,32(%esi), %xmm0
628 pcmpeqb 32(%edi), %xmm0
629 lea 32(%esi), %esi
630 pcmpeqb 48(%edi), %xmm3
631
632 lea 32(%edi), %edi
633 jz L(shr_5_gobble_loop)
634 pand %xmm0, %xmm3
635
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
636 cmp $0, %ecx
637 jge L(shr_5_gobble_next)
638 inc %edx
639 add $32, %ecx
640L(shr_5_gobble_next):
641 test %edx, %edx
642 jnz L(exit)
643
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
644 pmovmskb %xmm3, %edx
645 movdqa %xmm0, %xmm1
646 lea 32(%edi), %edi
647 lea 32(%esi), %esi
648 sub $0xffff, %edx
649 jnz L(exit)
650
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 5 to the tail
   code (L(less48bytes), outside this excerpt).  */
651 lea (%ecx, %edi,1), %eax
652 lea 5(%ecx, %esi,1), %edx
653 POP (%edi)
654 POP (%esi)
655 jmp L(less48bytes)
656
/* L(shr_6): BLK2 data starts 6 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_6_gobble); otherwise one 32-byte step is compared here, using
   palignr $6 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
657 cfi_restore_state
658 cfi_remember_state
659 .p2align 4
660L(shr_6):
661 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 6 on entry) so L(exit) can add it back onto %esi.  */
662 lea -48(%ecx), %ecx
663 mov %edx, %eax
664 jae L(shr_6_gobble)
665
666 movdqa 16(%esi), %xmm1
667 movdqa %xmm1, %xmm2
668 palignr $6,(%esi), %xmm1
669 pcmpeqb (%edi), %xmm1
670
671 movdqa 32(%esi), %xmm3
672 palignr $6,%xmm2, %xmm3
673 pcmpeqb 16(%edi), %xmm3
674
675 pand %xmm1, %xmm3
676 pmovmskb %xmm3, %edx
677 lea 32(%edi), %edi
678 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
679 sub $0xffff, %edx
680 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 6 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
681 lea (%ecx, %edi,1), %eax
682 lea 6(%ecx, %esi,1), %edx
683 POP (%edi)
684 POP (%esi)
685 jmp L(less48bytes)
686
/* L(shr_6_gobble): 32-bytes-per-iteration loop for the 6-byte shift case.
   Each half is rebuilt with palignr $6 from two aligned loads at %esi and
   compared with the aligned data at %edi; results for the next 32 bytes
   are computed while the previous mask is tested.  %xmm1 keeps the
   first-half result for L(exit); %eax still holds the shift.  */
687 cfi_restore_state
688 cfi_remember_state
689 .p2align 4
690L(shr_6_gobble):
691 sub $32, %ecx
692 movdqa 16(%esi), %xmm0
693 palignr $6,(%esi), %xmm0
694 pcmpeqb (%edi), %xmm0
695
696 movdqa 32(%esi), %xmm3
697 palignr $6,16(%esi), %xmm3
698 pcmpeqb 16(%edi), %xmm3
699
700L(shr_6_gobble_loop):
701 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
702 sub $32, %ecx
703 pmovmskb %xmm3, %edx
704 movdqa %xmm0, %xmm1
705
706 movdqa 64(%esi), %xmm3
707 palignr $6,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
708 sbb $0xffff, %edx
709 movdqa 48(%esi), %xmm0
710 palignr $6,32(%esi), %xmm0
711 pcmpeqb 32(%edi), %xmm0
712 lea 32(%esi), %esi
713 pcmpeqb 48(%edi), %xmm3
714
715 lea 32(%edi), %edi
716 jz L(shr_6_gobble_loop)
717 pand %xmm0, %xmm3
718
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
719 cmp $0, %ecx
720 jge L(shr_6_gobble_next)
721 inc %edx
722 add $32, %ecx
723L(shr_6_gobble_next):
724 test %edx, %edx
725 jnz L(exit)
726
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
727 pmovmskb %xmm3, %edx
728 movdqa %xmm0, %xmm1
729 lea 32(%edi), %edi
730 lea 32(%esi), %esi
731 sub $0xffff, %edx
732 jnz L(exit)
733
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 6 to the tail
   code (L(less48bytes), outside this excerpt).  */
734 lea (%ecx, %edi,1), %eax
735 lea 6(%ecx, %esi,1), %edx
736 POP (%edi)
737 POP (%esi)
738 jmp L(less48bytes)
739
/* L(shr_7): BLK2 data starts 7 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_7_gobble); otherwise one 32-byte step is compared here, using
   palignr $7 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
740 cfi_restore_state
741 cfi_remember_state
742 .p2align 4
743L(shr_7):
744 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 7 on entry) so L(exit) can add it back onto %esi.  */
745 lea -48(%ecx), %ecx
746 mov %edx, %eax
747 jae L(shr_7_gobble)
748
749 movdqa 16(%esi), %xmm1
750 movdqa %xmm1, %xmm2
751 palignr $7,(%esi), %xmm1
752 pcmpeqb (%edi), %xmm1
753
754 movdqa 32(%esi), %xmm3
755 palignr $7,%xmm2, %xmm3
756 pcmpeqb 16(%edi), %xmm3
757
758 pand %xmm1, %xmm3
759 pmovmskb %xmm3, %edx
760 lea 32(%edi), %edi
761 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
762 sub $0xffff, %edx
763 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 7 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
764 lea (%ecx, %edi,1), %eax
765 lea 7(%ecx, %esi,1), %edx
766 POP (%edi)
767 POP (%esi)
768 jmp L(less48bytes)
769
/* L(shr_7_gobble): 32-bytes-per-iteration loop for the 7-byte shift case.
   Each half is rebuilt with palignr $7 from two aligned loads at %esi and
   compared with the aligned data at %edi; results for the next 32 bytes
   are computed while the previous mask is tested.  %xmm1 keeps the
   first-half result for L(exit); %eax still holds the shift.  */
770 cfi_restore_state
771 cfi_remember_state
772 .p2align 4
773L(shr_7_gobble):
774 sub $32, %ecx
775 movdqa 16(%esi), %xmm0
776 palignr $7,(%esi), %xmm0
777 pcmpeqb (%edi), %xmm0
778
779 movdqa 32(%esi), %xmm3
780 palignr $7,16(%esi), %xmm3
781 pcmpeqb 16(%edi), %xmm3
782
783L(shr_7_gobble_loop):
784 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
785 sub $32, %ecx
786 pmovmskb %xmm3, %edx
787 movdqa %xmm0, %xmm1
788
789 movdqa 64(%esi), %xmm3
790 palignr $7,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
791 sbb $0xffff, %edx
792 movdqa 48(%esi), %xmm0
793 palignr $7,32(%esi), %xmm0
794 pcmpeqb 32(%edi), %xmm0
795 lea 32(%esi), %esi
796 pcmpeqb 48(%edi), %xmm3
797
798 lea 32(%edi), %edi
799 jz L(shr_7_gobble_loop)
800 pand %xmm0, %xmm3
801
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
802 cmp $0, %ecx
803 jge L(shr_7_gobble_next)
804 inc %edx
805 add $32, %ecx
806L(shr_7_gobble_next):
807 test %edx, %edx
808 jnz L(exit)
809
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
810 pmovmskb %xmm3, %edx
811 movdqa %xmm0, %xmm1
812 lea 32(%edi), %edi
813 lea 32(%esi), %esi
814 sub $0xffff, %edx
815 jnz L(exit)
816
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 7 to the tail
   code (L(less48bytes), outside this excerpt).  */
817 lea (%ecx, %edi,1), %eax
818 lea 7(%ecx, %esi,1), %edx
819 POP (%edi)
820 POP (%esi)
821 jmp L(less48bytes)
822# endif
823
/* L(shr_8): BLK2 data starts 8 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  Built for both memcmp and
   wmemcmp.  With %ecx >= 80 the work goes to L(shr_8_gobble); otherwise
   one 32-byte step is compared here, using palignr $8 to splice adjacent
   aligned loads from %esi so they line up with the bytes at %edi.  */
824 cfi_restore_state
825 cfi_remember_state
826 .p2align 4
827L(shr_8):
828 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 8 on entry) so L(exit) can add it back onto %esi.  */
829 lea -48(%ecx), %ecx
830 mov %edx, %eax
831 jae L(shr_8_gobble)
832
833 movdqa 16(%esi), %xmm1
834 movdqa %xmm1, %xmm2
835 palignr $8,(%esi), %xmm1
836 pcmpeqb (%edi), %xmm1
837
838 movdqa 32(%esi), %xmm3
839 palignr $8,%xmm2, %xmm3
840 pcmpeqb 16(%edi), %xmm3
841
842 pand %xmm1, %xmm3
843 pmovmskb %xmm3, %edx
844 lea 32(%edi), %edi
845 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
846 sub $0xffff, %edx
847 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 8 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
848 lea (%ecx, %edi,1), %eax
849 lea 8(%ecx, %esi,1), %edx
850 POP (%edi)
851 POP (%esi)
852 jmp L(less48bytes)
853
/* L(shr_8_gobble): 32-bytes-per-iteration loop for the 8-byte shift case.
   Each half is rebuilt with palignr $8 from two aligned loads at %esi and
   compared with the aligned data at %edi; results for the next 32 bytes
   are computed while the previous mask is tested.  %xmm1 keeps the
   first-half result for L(exit); %eax still holds the shift.  */
854 cfi_restore_state
855 cfi_remember_state
856 .p2align 4
857L(shr_8_gobble):
858 sub $32, %ecx
859 movdqa 16(%esi), %xmm0
860 palignr $8,(%esi), %xmm0
861 pcmpeqb (%edi), %xmm0
862
863 movdqa 32(%esi), %xmm3
864 palignr $8,16(%esi), %xmm3
865 pcmpeqb 16(%edi), %xmm3
866
867L(shr_8_gobble_loop):
868 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
869 sub $32, %ecx
870 pmovmskb %xmm3, %edx
871 movdqa %xmm0, %xmm1
872
873 movdqa 64(%esi), %xmm3
874 palignr $8,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
875 sbb $0xffff, %edx
876 movdqa 48(%esi), %xmm0
877 palignr $8,32(%esi), %xmm0
878 pcmpeqb 32(%edi), %xmm0
879 lea 32(%esi), %esi
880 pcmpeqb 48(%edi), %xmm3
881
882 lea 32(%edi), %edi
883 jz L(shr_8_gobble_loop)
884 pand %xmm0, %xmm3
885
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
886 cmp $0, %ecx
887 jge L(shr_8_gobble_next)
888 inc %edx
889 add $32, %ecx
890L(shr_8_gobble_next):
891 test %edx, %edx
892 jnz L(exit)
893
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
894 pmovmskb %xmm3, %edx
895 movdqa %xmm0, %xmm1
896 lea 32(%edi), %edi
897 lea 32(%esi), %esi
898 sub $0xffff, %edx
899 jnz L(exit)
900
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 8 to the tail
   code (L(less48bytes), outside this excerpt).  */
901 lea (%ecx, %edi,1), %eax
902 lea 8(%ecx, %esi,1), %edx
903 POP (%edi)
904 POP (%esi)
905 jmp L(less48bytes)
906
907# ifndef USE_AS_WMEMCMP
/* L(shr_9): BLK2 data starts 9 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_9_gobble); otherwise one 32-byte step is compared here, using
   palignr $9 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
908 cfi_restore_state
909 cfi_remember_state
910 .p2align 4
911L(shr_9):
912 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 9 on entry) so L(exit) can add it back onto %esi.  */
913 lea -48(%ecx), %ecx
914 mov %edx, %eax
915 jae L(shr_9_gobble)
916
917 movdqa 16(%esi), %xmm1
918 movdqa %xmm1, %xmm2
919 palignr $9,(%esi), %xmm1
920 pcmpeqb (%edi), %xmm1
921
922 movdqa 32(%esi), %xmm3
923 palignr $9,%xmm2, %xmm3
924 pcmpeqb 16(%edi), %xmm3
925
926 pand %xmm1, %xmm3
927 pmovmskb %xmm3, %edx
928 lea 32(%edi), %edi
929 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
930 sub $0xffff, %edx
931 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 9 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
932 lea (%ecx, %edi,1), %eax
933 lea 9(%ecx, %esi,1), %edx
934 POP (%edi)
935 POP (%esi)
936 jmp L(less48bytes)
937
/* L(shr_9_gobble): 32-bytes-per-iteration loop for the 9-byte shift case.
   Each half is rebuilt with palignr $9 from two aligned loads at %esi and
   compared with the aligned data at %edi; results for the next 32 bytes
   are computed while the previous mask is tested.  %xmm1 keeps the
   first-half result for L(exit); %eax still holds the shift.  */
938 cfi_restore_state
939 cfi_remember_state
940 .p2align 4
941L(shr_9_gobble):
942 sub $32, %ecx
943 movdqa 16(%esi), %xmm0
944 palignr $9,(%esi), %xmm0
945 pcmpeqb (%edi), %xmm0
946
947 movdqa 32(%esi), %xmm3
948 palignr $9,16(%esi), %xmm3
949 pcmpeqb 16(%edi), %xmm3
950
951L(shr_9_gobble_loop):
952 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
953 sub $32, %ecx
954 pmovmskb %xmm3, %edx
955 movdqa %xmm0, %xmm1
956
957 movdqa 64(%esi), %xmm3
958 palignr $9,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
959 sbb $0xffff, %edx
960 movdqa 48(%esi), %xmm0
961 palignr $9,32(%esi), %xmm0
962 pcmpeqb 32(%edi), %xmm0
963 lea 32(%esi), %esi
964 pcmpeqb 48(%edi), %xmm3
965
966 lea 32(%edi), %edi
967 jz L(shr_9_gobble_loop)
968 pand %xmm0, %xmm3
969
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
970 cmp $0, %ecx
971 jge L(shr_9_gobble_next)
972 inc %edx
973 add $32, %ecx
974L(shr_9_gobble_next):
975 test %edx, %edx
976 jnz L(exit)
977
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
978 pmovmskb %xmm3, %edx
979 movdqa %xmm0, %xmm1
980 lea 32(%edi), %edi
981 lea 32(%esi), %esi
982 sub $0xffff, %edx
983 jnz L(exit)
984
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 9 to the tail
   code (L(less48bytes), outside this excerpt).  */
985 lea (%ecx, %edi,1), %eax
986 lea 9(%ecx, %esi,1), %edx
987 POP (%edi)
988 POP (%esi)
989 jmp L(less48bytes)
990
/* L(shr_10): BLK2 data starts 10 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_10_gobble); otherwise one 32-byte step is compared here, using
   palignr $10 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
991 cfi_restore_state
992 cfi_remember_state
993 .p2align 4
994L(shr_10):
995 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 10 on entry) so L(exit) can add it back onto %esi.  */
996 lea -48(%ecx), %ecx
997 mov %edx, %eax
998 jae L(shr_10_gobble)
999
1000 movdqa 16(%esi), %xmm1
1001 movdqa %xmm1, %xmm2
1002 palignr $10, (%esi), %xmm1
1003 pcmpeqb (%edi), %xmm1
1004
1005 movdqa 32(%esi), %xmm3
1006 palignr $10,%xmm2, %xmm3
1007 pcmpeqb 16(%edi), %xmm3
1008
1009 pand %xmm1, %xmm3
1010 pmovmskb %xmm3, %edx
1011 lea 32(%edi), %edi
1012 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
1013 sub $0xffff, %edx
1014 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 10 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
1015 lea (%ecx, %edi,1), %eax
1016 lea 10(%ecx, %esi,1), %edx
1017 POP (%edi)
1018 POP (%esi)
1019 jmp L(less48bytes)
1020
/* L(shr_10_gobble): 32-bytes-per-iteration loop for the 10-byte shift
   case.  Each half is rebuilt with palignr $10 from two aligned loads at
   %esi and compared with the aligned data at %edi; results for the next
   32 bytes are computed while the previous mask is tested.  %xmm1 keeps
   the first-half result for L(exit); %eax still holds the shift.  */
1021 cfi_restore_state
1022 cfi_remember_state
1023 .p2align 4
1024L(shr_10_gobble):
1025 sub $32, %ecx
1026 movdqa 16(%esi), %xmm0
1027 palignr $10, (%esi), %xmm0
1028 pcmpeqb (%edi), %xmm0
1029
1030 movdqa 32(%esi), %xmm3
1031 palignr $10, 16(%esi), %xmm3
1032 pcmpeqb 16(%edi), %xmm3
1033
1034L(shr_10_gobble_loop):
1035 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
1036 sub $32, %ecx
1037 pmovmskb %xmm3, %edx
1038 movdqa %xmm0, %xmm1
1039
1040 movdqa 64(%esi), %xmm3
1041 palignr $10,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
1042 sbb $0xffff, %edx
1043 movdqa 48(%esi), %xmm0
1044 palignr $10,32(%esi), %xmm0
1045 pcmpeqb 32(%edi), %xmm0
1046 lea 32(%esi), %esi
1047 pcmpeqb 48(%edi), %xmm3
1048
1049 lea 32(%edi), %edi
1050 jz L(shr_10_gobble_loop)
1051 pand %xmm0, %xmm3
1052
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
1053 cmp $0, %ecx
1054 jge L(shr_10_gobble_next)
1055 inc %edx
1056 add $32, %ecx
1057L(shr_10_gobble_next):
1058 test %edx, %edx
1059 jnz L(exit)
1060
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
1061 pmovmskb %xmm3, %edx
1062 movdqa %xmm0, %xmm1
1063 lea 32(%edi), %edi
1064 lea 32(%esi), %esi
1065 sub $0xffff, %edx
1066 jnz L(exit)
1067
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 10 to the tail
   code (L(less48bytes), outside this excerpt).  */
1068 lea (%ecx, %edi,1), %eax
1069 lea 10(%ecx, %esi,1), %edx
1070 POP (%edi)
1071 POP (%esi)
1072 jmp L(less48bytes)
1073
/* L(shr_11): BLK2 data starts 11 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_11_gobble); otherwise one 32-byte step is compared here, using
   palignr $11 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
1074 cfi_restore_state
1075 cfi_remember_state
1076 .p2align 4
1077L(shr_11):
1078 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 11 on entry) so L(exit) can add it back onto %esi.  */
1079 lea -48(%ecx), %ecx
1080 mov %edx, %eax
1081 jae L(shr_11_gobble)
1082
1083 movdqa 16(%esi), %xmm1
1084 movdqa %xmm1, %xmm2
1085 palignr $11, (%esi), %xmm1
1086 pcmpeqb (%edi), %xmm1
1087
1088 movdqa 32(%esi), %xmm3
1089 palignr $11, %xmm2, %xmm3
1090 pcmpeqb 16(%edi), %xmm3
1091
1092 pand %xmm1, %xmm3
1093 pmovmskb %xmm3, %edx
1094 lea 32(%edi), %edi
1095 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
1096 sub $0xffff, %edx
1097 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 11 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
1098 lea (%ecx, %edi,1), %eax
1099 lea 11(%ecx, %esi,1), %edx
1100 POP (%edi)
1101 POP (%esi)
1102 jmp L(less48bytes)
1103
/* L(shr_11_gobble): 32-bytes-per-iteration loop for the 11-byte shift
   case.  Each half is rebuilt with palignr $11 from two aligned loads at
   %esi and compared with the aligned data at %edi; results for the next
   32 bytes are computed while the previous mask is tested.  %xmm1 keeps
   the first-half result for L(exit); %eax still holds the shift.  */
1104 cfi_restore_state
1105 cfi_remember_state
1106 .p2align 4
1107L(shr_11_gobble):
1108 sub $32, %ecx
1109 movdqa 16(%esi), %xmm0
1110 palignr $11, (%esi), %xmm0
1111 pcmpeqb (%edi), %xmm0
1112
1113 movdqa 32(%esi), %xmm3
1114 palignr $11, 16(%esi), %xmm3
1115 pcmpeqb 16(%edi), %xmm3
1116
1117L(shr_11_gobble_loop):
1118 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
1119 sub $32, %ecx
1120 pmovmskb %xmm3, %edx
1121 movdqa %xmm0, %xmm1
1122
1123 movdqa 64(%esi), %xmm3
1124 palignr $11,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
1125 sbb $0xffff, %edx
1126 movdqa 48(%esi), %xmm0
1127 palignr $11,32(%esi), %xmm0
1128 pcmpeqb 32(%edi), %xmm0
1129 lea 32(%esi), %esi
1130 pcmpeqb 48(%edi), %xmm3
1131
1132 lea 32(%edi), %edi
1133 jz L(shr_11_gobble_loop)
1134 pand %xmm0, %xmm3
1135
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
1136 cmp $0, %ecx
1137 jge L(shr_11_gobble_next)
1138 inc %edx
1139 add $32, %ecx
1140L(shr_11_gobble_next):
1141 test %edx, %edx
1142 jnz L(exit)
1143
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
1144 pmovmskb %xmm3, %edx
1145 movdqa %xmm0, %xmm1
1146 lea 32(%edi), %edi
1147 lea 32(%esi), %esi
1148 sub $0xffff, %edx
1149 jnz L(exit)
1150
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 11 to the tail
   code (L(less48bytes), outside this excerpt).  */
1151 lea (%ecx, %edi,1), %eax
1152 lea 11(%ecx, %esi,1), %edx
1153 POP (%edi)
1154 POP (%esi)
1155 jmp L(less48bytes)
1156# endif
1157
/* L(shr_12): BLK2 data starts 12 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  Built for both memcmp and
   wmemcmp.  With %ecx >= 80 the work goes to L(shr_12_gobble); otherwise
   one 32-byte step is compared here, using palignr $12 to splice adjacent
   aligned loads from %esi so they line up with the bytes at %edi.  */
1158 cfi_restore_state
1159 cfi_remember_state
1160 .p2align 4
1161L(shr_12):
1162 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx on entry) so L(exit) can add it back onto %esi.  */
1163 lea -48(%ecx), %ecx
1164 mov %edx, %eax
1165 jae L(shr_12_gobble)
1166
1167 movdqa 16(%esi), %xmm1
1168 movdqa %xmm1, %xmm2
1169 palignr $12, (%esi), %xmm1
1170 pcmpeqb (%edi), %xmm1
1171
1172 movdqa 32(%esi), %xmm3
1173 palignr $12, %xmm2, %xmm3
1174 pcmpeqb 16(%edi), %xmm3
1175
1176 pand %xmm1, %xmm3
1177 pmovmskb %xmm3, %edx
1178 lea 32(%edi), %edi
1179 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
1180 sub $0xffff, %edx
1181 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 12 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
1182 lea (%ecx, %edi,1), %eax
1183 lea 12(%ecx, %esi,1), %edx
1184 POP (%edi)
1185 POP (%esi)
1186 jmp L(less48bytes)
1187
/* L(shr_12_gobble): 32-bytes-per-iteration loop for the 12-byte shift
   case.  Each half is rebuilt with palignr $12 from two aligned loads at
   %esi and compared with the aligned data at %edi; results for the next
   32 bytes are computed while the previous mask is tested.  %xmm1 keeps
   the first-half result for L(exit); %eax still holds the shift.  */
1188 cfi_restore_state
1189 cfi_remember_state
1190 .p2align 4
1191L(shr_12_gobble):
1192 sub $32, %ecx
1193 movdqa 16(%esi), %xmm0
1194 palignr $12, (%esi), %xmm0
1195 pcmpeqb (%edi), %xmm0
1196
1197 movdqa 32(%esi), %xmm3
1198 palignr $12, 16(%esi), %xmm3
1199 pcmpeqb 16(%edi), %xmm3
1200
1201L(shr_12_gobble_loop):
1202 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
1203 sub $32, %ecx
1204 pmovmskb %xmm3, %edx
1205 movdqa %xmm0, %xmm1
1206
1207 movdqa 64(%esi), %xmm3
1208 palignr $12,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
1209 sbb $0xffff, %edx
1210 movdqa 48(%esi), %xmm0
1211 palignr $12,32(%esi), %xmm0
1212 pcmpeqb 32(%edi), %xmm0
1213 lea 32(%esi), %esi
1214 pcmpeqb 48(%edi), %xmm3
1215
1216 lea 32(%edi), %edi
1217 jz L(shr_12_gobble_loop)
1218 pand %xmm0, %xmm3
1219
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
1220 cmp $0, %ecx
1221 jge L(shr_12_gobble_next)
1222 inc %edx
1223 add $32, %ecx
1224L(shr_12_gobble_next):
1225 test %edx, %edx
1226 jnz L(exit)
1227
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
1228 pmovmskb %xmm3, %edx
1229 movdqa %xmm0, %xmm1
1230 lea 32(%edi), %edi
1231 lea 32(%esi), %esi
1232 sub $0xffff, %edx
1233 jnz L(exit)
1234
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 12 to the tail
   code (L(less48bytes), outside this excerpt).  */
1235 lea (%ecx, %edi,1), %eax
1236 lea 12(%ecx, %esi,1), %edx
1237 POP (%edi)
1238 POP (%esi)
1239 jmp L(less48bytes)
1240
1241# ifndef USE_AS_WMEMCMP
/* L(shr_13): BLK2 data starts 13 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_13_gobble); otherwise one 32-byte step is compared here, using
   palignr $13 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
1242 cfi_restore_state
1243 cfi_remember_state
1244 .p2align 4
1245L(shr_13):
1246 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 13 on entry) so L(exit) can add it back onto %esi.  */
1247 lea -48(%ecx), %ecx
1248 mov %edx, %eax
1249 jae L(shr_13_gobble)
1250
1251 movdqa 16(%esi), %xmm1
1252 movdqa %xmm1, %xmm2
1253 palignr $13, (%esi), %xmm1
1254 pcmpeqb (%edi), %xmm1
1255
1256 movdqa 32(%esi), %xmm3
1257 palignr $13, %xmm2, %xmm3
1258 pcmpeqb 16(%edi), %xmm3
1259
1260 pand %xmm1, %xmm3
1261 pmovmskb %xmm3, %edx
1262 lea 32(%edi), %edi
1263 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
1264 sub $0xffff, %edx
1265 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 13 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
1266 lea (%ecx, %edi,1), %eax
1267 lea 13(%ecx, %esi,1), %edx
1268 POP (%edi)
1269 POP (%esi)
1270 jmp L(less48bytes)
1271
/* L(shr_13_gobble): 32-bytes-per-iteration loop for the 13-byte shift
   case.  Each half is rebuilt with palignr $13 from two aligned loads at
   %esi and compared with the aligned data at %edi; results for the next
   32 bytes are computed while the previous mask is tested.  %xmm1 keeps
   the first-half result for L(exit); %eax still holds the shift.  */
1272 cfi_restore_state
1273 cfi_remember_state
1274 .p2align 4
1275L(shr_13_gobble):
1276 sub $32, %ecx
1277 movdqa 16(%esi), %xmm0
1278 palignr $13, (%esi), %xmm0
1279 pcmpeqb (%edi), %xmm0
1280
1281 movdqa 32(%esi), %xmm3
1282 palignr $13, 16(%esi), %xmm3
1283 pcmpeqb 16(%edi), %xmm3
1284
1285L(shr_13_gobble_loop):
1286 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
1287 sub $32, %ecx
1288 pmovmskb %xmm3, %edx
1289 movdqa %xmm0, %xmm1
1290
1291 movdqa 64(%esi), %xmm3
1292 palignr $13,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
1293 sbb $0xffff, %edx
1294 movdqa 48(%esi), %xmm0
1295 palignr $13,32(%esi), %xmm0
1296 pcmpeqb 32(%edi), %xmm0
1297 lea 32(%esi), %esi
1298 pcmpeqb 48(%edi), %xmm3
1299
1300 lea 32(%edi), %edi
1301 jz L(shr_13_gobble_loop)
1302 pand %xmm0, %xmm3
1303
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
1304 cmp $0, %ecx
1305 jge L(shr_13_gobble_next)
1306 inc %edx
1307 add $32, %ecx
1308L(shr_13_gobble_next):
1309 test %edx, %edx
1310 jnz L(exit)
1311
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
1312 pmovmskb %xmm3, %edx
1313 movdqa %xmm0, %xmm1
1314 lea 32(%edi), %edi
1315 lea 32(%esi), %esi
1316 sub $0xffff, %edx
1317 jnz L(exit)
1318
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 13 to the tail
   code (L(less48bytes), outside this excerpt).  */
1319 lea (%ecx, %edi,1), %eax
1320 lea 13(%ecx, %esi,1), %edx
1321 POP (%edi)
1322 POP (%esi)
1323 jmp L(less48bytes)
1324
/* L(shr_14): BLK2 data starts 14 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_14_gobble); otherwise one 32-byte step is compared here, using
   palignr $14 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
1325 cfi_restore_state
1326 cfi_remember_state
1327 .p2align 4
1328L(shr_14):
1329 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 14 on entry) so L(exit) can add it back onto %esi.  */
1330 lea -48(%ecx), %ecx
1331 mov %edx, %eax
1332 jae L(shr_14_gobble)
1333
1334 movdqa 16(%esi), %xmm1
1335 movdqa %xmm1, %xmm2
1336 palignr $14, (%esi), %xmm1
1337 pcmpeqb (%edi), %xmm1
1338
1339 movdqa 32(%esi), %xmm3
1340 palignr $14, %xmm2, %xmm3
1341 pcmpeqb 16(%edi), %xmm3
1342
1343 pand %xmm1, %xmm3
1344 pmovmskb %xmm3, %edx
1345 lea 32(%edi), %edi
1346 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
1347 sub $0xffff, %edx
1348 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 14 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
1349 lea (%ecx, %edi,1), %eax
1350 lea 14(%ecx, %esi,1), %edx
1351 POP (%edi)
1352 POP (%esi)
1353 jmp L(less48bytes)
1354
/* L(shr_14_gobble): 32-bytes-per-iteration loop for the 14-byte shift
   case.  Each half is rebuilt with palignr $14 from two aligned loads at
   %esi and compared with the aligned data at %edi; results for the next
   32 bytes are computed while the previous mask is tested.  %xmm1 keeps
   the first-half result for L(exit); %eax still holds the shift.  */
1355 cfi_restore_state
1356 cfi_remember_state
1357 .p2align 4
1358L(shr_14_gobble):
1359 sub $32, %ecx
1360 movdqa 16(%esi), %xmm0
1361 palignr $14, (%esi), %xmm0
1362 pcmpeqb (%edi), %xmm0
1363
1364 movdqa 32(%esi), %xmm3
1365 palignr $14, 16(%esi), %xmm3
1366 pcmpeqb 16(%edi), %xmm3
1367
1368L(shr_14_gobble_loop):
1369 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
1370 sub $32, %ecx
1371 pmovmskb %xmm3, %edx
1372 movdqa %xmm0, %xmm1
1373
1374 movdqa 64(%esi), %xmm3
1375 palignr $14,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
1376 sbb $0xffff, %edx
1377 movdqa 48(%esi), %xmm0
1378 palignr $14,32(%esi), %xmm0
1379 pcmpeqb 32(%edi), %xmm0
1380 lea 32(%esi), %esi
1381 pcmpeqb 48(%edi), %xmm3
1382
1383 lea 32(%edi), %edi
1384 jz L(shr_14_gobble_loop)
1385 pand %xmm0, %xmm3
1386
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
1387 cmp $0, %ecx
1388 jge L(shr_14_gobble_next)
1389 inc %edx
1390 add $32, %ecx
1391L(shr_14_gobble_next):
1392 test %edx, %edx
1393 jnz L(exit)
1394
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
1395 pmovmskb %xmm3, %edx
1396 movdqa %xmm0, %xmm1
1397 lea 32(%edi), %edi
1398 lea 32(%esi), %esi
1399 sub $0xffff, %edx
1400 jnz L(exit)
1401
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 14 to the tail
   code (L(less48bytes), outside this excerpt).  */
1402 lea (%ecx, %edi,1), %eax
1403 lea 14(%ecx, %esi,1), %edx
1404 POP (%edi)
1405 POP (%esi)
1406 jmp L(less48bytes)
1407
/* L(shr_15): BLK2 data starts 15 bytes past the 16-byte boundary held in
   %esi, while %edi is 16-byte aligned.  With %ecx >= 80 the work goes to
   L(shr_15_gobble); otherwise one 32-byte step is compared here, using
   palignr $15 to splice adjacent aligned loads from %esi so they line up
   with the bytes at %edi.  */
1408 cfi_restore_state
1409 cfi_remember_state
1410 .p2align 4
1411L(shr_15):
1412 cmp $80, %ecx
/* lea and mov leave the cmp flags intact for the jae; %eax keeps the
   shift (%edx = 15 on entry) so L(exit) can add it back onto %esi.  */
1413 lea -48(%ecx), %ecx
1414 mov %edx, %eax
1415 jae L(shr_15_gobble)
1416
1417 movdqa 16(%esi), %xmm1
1418 movdqa %xmm1, %xmm2
1419 palignr $15, (%esi), %xmm1
1420 pcmpeqb (%edi), %xmm1
1421
1422 movdqa 32(%esi), %xmm3
1423 palignr $15, %xmm2, %xmm3
1424 pcmpeqb 16(%edi), %xmm3
1425
1426 pand %xmm1, %xmm3
1427 pmovmskb %xmm3, %edx
1428 lea 32(%edi), %edi
1429 lea 32(%esi), %esi
/* Non-zero after the subtraction means at least one byte differed.  */
1430 sub $0xffff, %edx
1431 jnz L(exit)
/* All equal: pass %edi + %ecx and %esi + %ecx + 15 to the tail code
   (L(less48bytes), outside this excerpt) and drop the saved %edi/%esi.  */
1432 lea (%ecx, %edi,1), %eax
1433 lea 15(%ecx, %esi,1), %edx
1434 POP (%edi)
1435 POP (%esi)
1436 jmp L(less48bytes)
1437
/* L(shr_15_gobble): 32-bytes-per-iteration loop for the 15-byte shift
   case.  Each half is rebuilt with palignr $15 from two aligned loads at
   %esi and compared with the aligned data at %edi; results for the next
   32 bytes are computed while the previous mask is tested.  %xmm1 keeps
   the first-half result for L(exit); %eax still holds the shift.  */
1438 cfi_restore_state
1439 cfi_remember_state
1440 .p2align 4
1441L(shr_15_gobble):
1442 sub $32, %ecx
1443 movdqa 16(%esi), %xmm0
1444 palignr $15, (%esi), %xmm0
1445 pcmpeqb (%edi), %xmm0
1446
1447 movdqa 32(%esi), %xmm3
1448 palignr $15, 16(%esi), %xmm3
1449 pcmpeqb 16(%edi), %xmm3
1450
1451L(shr_15_gobble_loop):
1452 pand %xmm0, %xmm3
/* The borrow (CF) of this sub is consumed by the sbb below.  */
1453 sub $32, %ecx
1454 pmovmskb %xmm3, %edx
1455 movdqa %xmm0, %xmm1
1456
1457 movdqa 64(%esi), %xmm3
1458 palignr $15,48(%esi), %xmm3
/* %edx = mask - 0xffff - CF: zero only when all 32 bytes matched and the
   count did not borrow.  */
1459 sbb $0xffff, %edx
1460 movdqa 48(%esi), %xmm0
1461 palignr $15,32(%esi), %xmm0
1462 pcmpeqb 32(%edi), %xmm0
1463 lea 32(%esi), %esi
1464 pcmpeqb 48(%edi), %xmm3
1465
1466 lea 32(%edi), %edi
1467 jz L(shr_15_gobble_loop)
1468 pand %xmm0, %xmm3
1469
/* A negative %ecx goes with a borrow in the last sub: undo the extra 1
   taken by the sbb and give the 32 back to %ecx.  */
1470 cmp $0, %ecx
1471 jge L(shr_15_gobble_next)
1472 inc %edx
1473 add $32, %ecx
1474L(shr_15_gobble_next):
1475 test %edx, %edx
1476 jnz L(exit)
1477
/* Check the last 32 bytes whose results are already in %xmm0/%xmm3.  */
1478 pmovmskb %xmm3, %edx
1479 movdqa %xmm0, %xmm1
1480 lea 32(%edi), %edi
1481 lea 32(%esi), %esi
1482 sub $0xffff, %edx
1483 jnz L(exit)
1484
/* Everything matched: pass %edi + %ecx and %esi + %ecx + 15 to the tail
   code (L(less48bytes), outside this excerpt).  */
1485 lea (%ecx, %edi,1), %eax
1486 lea 15(%ecx, %esi,1), %edx
1487 POP (%edi)
1488 POP (%esi)
1489 jmp L(less48bytes)
1490# endif
1491
/* L(exit): common landing point once a 32-byte step has found a
   difference.  On entry:
     %xmm1      = byte-equality result for the first 16 of the 32 bytes
     %edx       = (combined 32-byte equality mask) - 0xffff, non-zero
     %eax       = shift amount (0 on the shr_0 paths, N on shr_N)
     %edi, %esi = cursors already advanced past the 32 bytes
   It decides which 16-byte half differs, adds the shift back onto %esi
   and falls through into L(less16bytes) with %edi/%esi 16 bytes past the
   start of the differing half and %edx holding that half's
   (mask - 0xffff).  */
1492 cfi_restore_state
1493 cfi_remember_state
1494 .p2align 4
1495L(exit):
1496 pmovmskb %xmm1, %ebx
/* Zero means the first half is fully equal, so the difference is in the
   second half and %edx already describes it.  */
1497 sub $0xffff, %ebx
1498 jz L(first16bytes)
/* Otherwise step both cursors back to the first half and use its mask.  */
1499 lea -16(%esi), %esi
1500 lea -16(%edi), %edi
1501 mov %ebx, %edx
1502
1503L(first16bytes):
1504 add %eax, %esi
/* Also entered directly from L(48bytesormore) when the very first
   (unaligned) 16-byte compare finds a difference.  */
1505L(less16bytes):
1506
1507# ifndef USE_AS_WMEMCMP
/* memcmp byte locator, low half.  %edx holds (equality mask - 0xffff) for
   a 16-byte chunk that contains a difference, and %edi/%esi point 16
   bytes past the start of that chunk.  Because equal bytes are 1-bits in
   the mask, subtracting 0xffff leaves the lowest set bit at the index of
   the first differing byte; the tests below walk bits 0..6 of %dl and
   fall through to L(Byte23) for bit 7.  %dl == 0 means the difference is
   in bytes 8..15, handled at L(next_24_bytes).  */
1508 test %dl, %dl
1509 jz L(next_24_bytes)
1510
1511 test $0x01, %dl
1512 jnz L(Byte16)
1513
1514 test $0x02, %dl
1515 jnz L(Byte17)
1516
1517 test $0x04, %dl
1518 jnz L(Byte18)
1519
1520 test $0x08, %dl
1521 jnz L(Byte19)
1522
1523 test $0x10, %dl
1524 jnz L(Byte20)
1525
1526 test $0x20, %dl
1527 jnz L(Byte21)
1528
1529 test $0x40, %dl
1530 jnz L(Byte22)
/* Bit 7: the byte at offset -9 differs.  Return the difference of the two
   zero-extended bytes (BLK1 byte minus BLK2 byte).  */
1531L(Byte23):
1532 movzbl -9(%edi), %eax
1533 movzbl -9(%esi), %edx
1534 sub %edx, %eax
1535 RETURN
1536
/* L(Byte16) .. L(Byte22): return stubs for a first difference at offsets
   -16 .. -10 from %edi/%esi.  Each loads the two bytes zero-extended,
   returns (BLK1 byte - BLK2 byte) in %eax, and leaves through RETURN,
   which pops %edi/%esi/%ebx.  They are also reached from
   L(next_24_bytes) after both cursors have been advanced by 8.  */
1537 .p2align 4
1538L(Byte16):
1539 movzbl -16(%edi), %eax
1540 movzbl -16(%esi), %edx
1541 sub %edx, %eax
1542 RETURN
1543
1544 .p2align 4
1545L(Byte17):
1546 movzbl -15(%edi), %eax
1547 movzbl -15(%esi), %edx
1548 sub %edx, %eax
1549 RETURN
1550
1551 .p2align 4
1552L(Byte18):
1553 movzbl -14(%edi), %eax
1554 movzbl -14(%esi), %edx
1555 sub %edx, %eax
1556 RETURN
1557
1558 .p2align 4
1559L(Byte19):
1560 movzbl -13(%edi), %eax
1561 movzbl -13(%esi), %edx
1562 sub %edx, %eax
1563 RETURN
1564
1565 .p2align 4
1566L(Byte20):
1567 movzbl -12(%edi), %eax
1568 movzbl -12(%esi), %edx
1569 sub %edx, %eax
1570 RETURN
1571
1572 .p2align 4
1573L(Byte21):
1574 movzbl -11(%edi), %eax
1575 movzbl -11(%esi), %edx
1576 sub %edx, %eax
1577 RETURN
1578
1579 .p2align 4
1580L(Byte22):
1581 movzbl -10(%edi), %eax
1582 movzbl -10(%esi), %edx
1583 sub %edx, %eax
1584 RETURN
1585
1586 .p2align 4
1587L(next_24_bytes):
1588 lea 8(%edi), %edi
1589 lea 8(%esi), %esi
1590 test $0x01, %dh
1591 jnz L(Byte16)
1592
1593 test $0x02, %dh
1594 jnz L(Byte17)
1595
1596 test $0x04, %dh
1597 jnz L(Byte18)
1598
1599 test $0x08, %dh
1600 jnz L(Byte19)
1601
1602 test $0x10, %dh
1603 jnz L(Byte20)
1604
1605 test $0x20, %dh
1606 jnz L(Byte21)
1607
1608 test $0x40, %dh
1609 jnz L(Byte22)
1610
1611 .p2align 4
1612L(Byte31):
1613 movzbl -9(%edi), %eax
1614 movzbl -9(%esi), %edx
1615 sub %edx, %eax
1616 RETURN_END
1617# else
1618
1619/* special for wmemcmp */
/* Locate the first differing 4-byte element inside the 16-byte chunk and
   return +1 or -1 from a signed comparison of that element.
   %edx holds (pcmpeqb mask) - 0xffff as produced by L(exit); within bits
   0..15 the lowest set bit marks the first unequal byte, so each nibble
   of %dl/%dh stands for one element.  %edi/%esi point 16 bytes past the
   start of the chunk.  */
/* %eax is cleared here and then overwritten by the load on every path.  */
1620 xor %eax, %eax
1621 test %dl, %dl
/* Low eight bits clear: the mismatch is in element 2 or 3.  */
1622 jz L(next_two_double_words)
1623 and $15, %dl
1624 jz L(second_double_word)
/* Element 0.  The RETURN after the jne is reached only if the two words
   compare equal; the same shape repeats for the other elements.  */
1625 mov -16(%edi), %eax
1626 cmp -16(%esi), %eax
1627 jne L(nequal)
1628 RETURN
1629
1630 .p2align 4
/* Element 1.  */
1631L(second_double_word):
1632 mov -12(%edi), %eax
1633 cmp -12(%esi), %eax
1634 jne L(nequal)
1635 RETURN
1636
1637 .p2align 4
1638L(next_two_double_words):
1639 and $15, %dh
1640 jz L(fourth_double_word)
/* Element 2.  */
1641 mov -8(%edi), %eax
1642 cmp -8(%esi), %eax
1643 jne L(nequal)
1644 RETURN
1645
1646 .p2align 4
/* Element 3.  */
1647L(fourth_double_word):
1648 mov -4(%edi), %eax
1649 cmp -4(%esi), %eax
1650 jne L(nequal)
1651 RETURN
1652
1653 .p2align 4
/* Flags still come from the cmp that jumped here (mov does not change
   them).  jg is the signed test: return 1 when the %edi-side element is
   greater, otherwise negate to -1.  */
1654L(nequal):
1655 mov $1, %eax
1656 jg L(nequal_bigger)
1657 neg %eax
1658 RETURN
1659
1660 .p2align 4
/* Returns the 1 already in %eax.  RETURN_END is RETURN without the
   trailing CFI state directives.  */
1661L(nequal_bigger):
1662 RETURN_END
1663# endif
1664
/* CFI only (no instruction is emitted): record %ebx as saved on the stack
   for the code that follows.  */
1665 CFI_PUSH (%ebx)
1666
1667 .p2align 4
/* L(more8bytes): %ecx >= 8.  Counts of 16 and above go to
   L(more16bytes); otherwise jump to the chain entry named after the
   count.  In the memcmp build anything not matched explicitly is taken as
   15; in the wmemcmp build anything other than 8 is taken as 12 (the byte
   count was formed as 4 * n at function entry).  */
1668L(more8bytes):
1669 cmp $16, %ecx
1670 jae L(more16bytes)
1671 cmp $8, %ecx
1672 je L(8bytes)
1673# ifndef USE_AS_WMEMCMP
1674 cmp $9, %ecx
1675 je L(9bytes)
1676 cmp $10, %ecx
1677 je L(10bytes)
1678 cmp $11, %ecx
1679 je L(11bytes)
1680 cmp $12, %ecx
1681 je L(12bytes)
1682 cmp $13, %ecx
1683 je L(13bytes)
1684 cmp $14, %ecx
1685 je L(14bytes)
1686 jmp L(15bytes)
1687# else
1688 jmp L(12bytes)
1689# endif
1690
1691 .p2align 4
/* L(more16bytes): %ecx >= 16.  Counts of 24 and above go to
   L(more24bytes); otherwise jump to the chain entry named after the
   count.  Unmatched counts are taken as 23 (memcmp) or 20 (wmemcmp).  */
1692L(more16bytes):
1693 cmp $24, %ecx
1694 jae L(more24bytes)
1695 cmp $16, %ecx
1696 je L(16bytes)
1697# ifndef USE_AS_WMEMCMP
1698 cmp $17, %ecx
1699 je L(17bytes)
1700 cmp $18, %ecx
1701 je L(18bytes)
1702 cmp $19, %ecx
1703 je L(19bytes)
1704 cmp $20, %ecx
1705 je L(20bytes)
1706 cmp $21, %ecx
1707 je L(21bytes)
1708 cmp $22, %ecx
1709 je L(22bytes)
1710 jmp L(23bytes)
1711# else
1712 jmp L(20bytes)
1713# endif
1714
1715 .p2align 4
/* L(more24bytes): %ecx >= 24.  Counts of 32 and above go to
   L(more32bytes); otherwise jump to the chain entry named after the
   count.  Unmatched counts are taken as 31 (memcmp) or 28 (wmemcmp).  */
1716L(more24bytes):
1717 cmp $32, %ecx
1718 jae L(more32bytes)
1719 cmp $24, %ecx
1720 je L(24bytes)
1721# ifndef USE_AS_WMEMCMP
1722 cmp $25, %ecx
1723 je L(25bytes)
1724 cmp $26, %ecx
1725 je L(26bytes)
1726 cmp $27, %ecx
1727 je L(27bytes)
1728 cmp $28, %ecx
1729 je L(28bytes)
1730 cmp $29, %ecx
1731 je L(29bytes)
1732 cmp $30, %ecx
1733 je L(30bytes)
1734 jmp L(31bytes)
1735# else
1736 jmp L(28bytes)
1737# endif
1738
1739 .p2align 4
/* L(more32bytes): %ecx >= 32.  Counts of 40 and above go to
   L(more40bytes); otherwise jump to the chain entry named after the
   count.  Unmatched counts are taken as 39 (memcmp) or 36 (wmemcmp).  */
1740L(more32bytes):
1741 cmp $40, %ecx
1742 jae L(more40bytes)
1743 cmp $32, %ecx
1744 je L(32bytes)
1745# ifndef USE_AS_WMEMCMP
1746 cmp $33, %ecx
1747 je L(33bytes)
1748 cmp $34, %ecx
1749 je L(34bytes)
1750 cmp $35, %ecx
1751 je L(35bytes)
1752 cmp $36, %ecx
1753 je L(36bytes)
1754 cmp $37, %ecx
1755 je L(37bytes)
1756 cmp $38, %ecx
1757 je L(38bytes)
1758 jmp L(39bytes)
1759# else
1760 jmp L(36bytes)
1761# endif
1762
1763 .p2align 4
/* L(less48bytes): entry of the short-length dispatcher.
     %ecx        number of bytes left to compare
     %eax, %edx  end pointers of the two buffers; the chains reached from
                 here address the data with negative offsets from them
     %ebx        already saved on the stack (every chain ends in POP (%ebx))
   Counts of 8 and above are split further by L(more8bytes).  Below that,
   the memcmp build matches 2..6 explicitly and sends every other count to
   L(7bytes); the wmemcmp build goes straight to L(4bytes).  */
1764L(less48bytes):
1765 cmp $8, %ecx
1766 jae L(more8bytes)
1767# ifndef USE_AS_WMEMCMP
1768 cmp $2, %ecx
1769 je L(2bytes)
1770 cmp $3, %ecx
1771 je L(3bytes)
1772 cmp $4, %ecx
1773 je L(4bytes)
1774 cmp $5, %ecx
1775 je L(5bytes)
1776 cmp $6, %ecx
1777 je L(6bytes)
1778 jmp L(7bytes)
1779# else
1780 jmp L(4bytes)
1781# endif
1782
1783 .p2align 4
/* L(more40bytes): %ecx >= 40.  Jump to the chain entry named after the
   count; in the memcmp build anything not matched explicitly is taken as
   47.  In the wmemcmp build nothing follows the `je`, so any other count
   runs on into the code placed after the `# else` of this conditional
   (L(44bytes) of the wmemcmp variant).  */
1784L(more40bytes):
1785 cmp $40, %ecx
1786 je L(40bytes)
1787# ifndef USE_AS_WMEMCMP
1788 cmp $41, %ecx
1789 je L(41bytes)
1790 cmp $42, %ecx
1791 je L(42bytes)
1792 cmp $43, %ecx
1793 je L(43bytes)
1794 cmp $44, %ecx
1795 je L(44bytes)
1796 cmp $45, %ecx
1797 je L(45bytes)
1798 cmp $46, %ecx
1799 je L(46bytes)
1800 jmp L(47bytes)
1801
1802 .p2align 4
/* memcmp chain for counts that are a multiple of 4.
   %eax and %edx are the end pointers of the two buffers.  Entry L(Nbytes)
   compares the 32-bit word at offset -N on both sides and falls through to
   the next lower entry.  On a mismatch L(find_diff) receives the two words
   in %ecx (from the %eax side) and %ebx (from the %edx side).  */
1803L(44bytes):
1804 mov -44(%eax), %ecx
1805 mov -44(%edx), %ebx
1806 cmp %ebx, %ecx
1807 jne L(find_diff)
1808L(40bytes):
1809 mov -40(%eax), %ecx
1810 mov -40(%edx), %ebx
1811 cmp %ebx, %ecx
1812 jne L(find_diff)
1813L(36bytes):
1814 mov -36(%eax), %ecx
1815 mov -36(%edx), %ebx
1816 cmp %ebx, %ecx
1817 jne L(find_diff)
1818L(32bytes):
1819 mov -32(%eax), %ecx
1820 mov -32(%edx), %ebx
1821 cmp %ebx, %ecx
1822 jne L(find_diff)
1823L(28bytes):
1824 mov -28(%eax), %ecx
1825 mov -28(%edx), %ebx
1826 cmp %ebx, %ecx
1827 jne L(find_diff)
1828L(24bytes):
1829 mov -24(%eax), %ecx
1830 mov -24(%edx), %ebx
1831 cmp %ebx, %ecx
1832 jne L(find_diff)
1833L(20bytes):
1834 mov -20(%eax), %ecx
1835 mov -20(%edx), %ebx
1836 cmp %ebx, %ecx
1837 jne L(find_diff)
1838L(16bytes):
1839 mov -16(%eax), %ecx
1840 mov -16(%edx), %ebx
1841 cmp %ebx, %ecx
1842 jne L(find_diff)
1843L(12bytes):
1844 mov -12(%eax), %ecx
1845 mov -12(%edx), %ebx
1846 cmp %ebx, %ecx
1847 jne L(find_diff)
1848L(8bytes):
1849 mov -8(%eax), %ecx
1850 mov -8(%edx), %ebx
1851 cmp %ebx, %ecx
1852 jne L(find_diff)
1853L(4bytes):
1854 mov -4(%eax), %ecx
1855 mov -4(%edx), %ebx
1856 cmp %ebx, %ecx
/* Result 0 for the all-equal case.  mov is used because it sits between
   the cmp and the jne and must not disturb the flags.  */
1857 mov $0, %eax
1858 jne L(find_diff)
1859 POP (%ebx)
1860 ret
/* CFI only: the code that follows again runs with %ebx on the stack.  */
1861 CFI_PUSH (%ebx)
1862# else
1863 .p2align 4
/* wmemcmp chain.  %eax and %edx are the end pointers of the two buffers.
   Entry L(Nbytes) compares the 4-byte element at offset -N on the %eax
   side against the one on the %edx side and falls through to the next
   lower entry.  On a mismatch L(find_diff) is entered with the flags of
   that cmp still live; %ebx is not used as a scratch register here.  */
1864L(44bytes):
1865 mov -44(%eax), %ecx
1866 cmp -44(%edx), %ecx
1867 jne L(find_diff)
1868L(40bytes):
1869 mov -40(%eax), %ecx
1870 cmp -40(%edx), %ecx
1871 jne L(find_diff)
1872L(36bytes):
1873 mov -36(%eax), %ecx
1874 cmp -36(%edx), %ecx
1875 jne L(find_diff)
1876L(32bytes):
1877 mov -32(%eax), %ecx
1878 cmp -32(%edx), %ecx
1879 jne L(find_diff)
1880L(28bytes):
1881 mov -28(%eax), %ecx
1882 cmp -28(%edx), %ecx
1883 jne L(find_diff)
1884L(24bytes):
1885 mov -24(%eax), %ecx
1886 cmp -24(%edx), %ecx
1887 jne L(find_diff)
1888L(20bytes):
1889 mov -20(%eax), %ecx
1890 cmp -20(%edx), %ecx
1891 jne L(find_diff)
1892L(16bytes):
1893 mov -16(%eax), %ecx
1894 cmp -16(%edx), %ecx
1895 jne L(find_diff)
1896L(12bytes):
1897 mov -12(%eax), %ecx
1898 cmp -12(%edx), %ecx
1899 jne L(find_diff)
1900L(8bytes):
1901 mov -8(%eax), %ecx
1902 cmp -8(%edx), %ecx
1903 jne L(find_diff)
1904L(4bytes):
1905 mov -4(%eax), %ecx
/* Result 0 for the all-equal case.  The xor changes the flags, so it is
   placed after the last use of %eax as a pointer and before the cmp.  */
1906 xor %eax, %eax
1907 cmp -4(%edx), %ecx
1908 jne L(find_diff)
1909 POP (%ebx)
1910 ret
/* CFI only: the code that follows again runs with %ebx on the stack.  */
1911 CFI_PUSH (%ebx)
1912# endif
1913
1914# ifndef USE_AS_WMEMCMP
1915
1916 .p2align 4
/* memcmp chain for counts of the form 4k + 1.
   %eax and %edx are the end pointers of the two buffers.  Entry L(Nbytes)
   compares the 32-bit word at offset -N on both sides and falls through to
   the next lower entry; a mismatch goes to L(find_diff) with the words in
   %ecx (from the %eax side) and %ebx (from the %edx side).  After
   L(5bytes) one last byte at offset -1 remains.  */
1917L(45bytes):
1918 mov -45(%eax), %ecx
1919 mov -45(%edx), %ebx
1920 cmp %ebx, %ecx
1921 jne L(find_diff)
1922L(41bytes):
1923 mov -41(%eax), %ecx
1924 mov -41(%edx), %ebx
1925 cmp %ebx, %ecx
1926 jne L(find_diff)
1927L(37bytes):
1928 mov -37(%eax), %ecx
1929 mov -37(%edx), %ebx
1930 cmp %ebx, %ecx
1931 jne L(find_diff)
1932L(33bytes):
1933 mov -33(%eax), %ecx
1934 mov -33(%edx), %ebx
1935 cmp %ebx, %ecx
1936 jne L(find_diff)
1937L(29bytes):
1938 mov -29(%eax), %ecx
1939 mov -29(%edx), %ebx
1940 cmp %ebx, %ecx
1941 jne L(find_diff)
1942L(25bytes):
1943 mov -25(%eax), %ecx
1944 mov -25(%edx), %ebx
1945 cmp %ebx, %ecx
1946 jne L(find_diff)
1947L(21bytes):
1948 mov -21(%eax), %ecx
1949 mov -21(%edx), %ebx
1950 cmp %ebx, %ecx
1951 jne L(find_diff)
1952L(17bytes):
1953 mov -17(%eax), %ecx
1954 mov -17(%edx), %ebx
1955 cmp %ebx, %ecx
1956 jne L(find_diff)
1957L(13bytes):
1958 mov -13(%eax), %ecx
1959 mov -13(%edx), %ebx
1960 cmp %ebx, %ecx
1961 jne L(find_diff)
1962L(9bytes):
1963 mov -9(%eax), %ecx
1964 mov -9(%edx), %ebx
1965 cmp %ebx, %ecx
1966 jne L(find_diff)
1967L(5bytes):
1968 mov -5(%eax), %ecx
1969 mov -5(%edx), %ebx
1970 cmp %ebx, %ecx
1971 jne L(find_diff)
/* Final byte.  The flags of this byte compare are what L(end) turns into
   +1 or -1; the mov that sets the all-equal result 0 does not alter them.  */
1972 movzbl -1(%eax), %ecx
1973 cmp -1(%edx), %cl
1974 mov $0, %eax
1975 jne L(end)
1976 POP (%ebx)
1977 ret
/* CFI only: the code that follows again runs with %ebx on the stack.  */
1978 CFI_PUSH (%ebx)
1979
1980 .p2align 4
/* memcmp chain for counts of the form 4k + 2.
   %eax and %edx are the end pointers of the two buffers.  Entry L(Nbytes)
   compares the 32-bit word at offset -N on both sides and falls through to
   the next lower entry; a mismatch goes to L(find_diff) with the words in
   %ecx (from the %eax side) and %ebx (from the %edx side).  L(2bytes)
   handles the last two bytes.  */
1981L(46bytes):
1982 mov -46(%eax), %ecx
1983 mov -46(%edx), %ebx
1984 cmp %ebx, %ecx
1985 jne L(find_diff)
1986L(42bytes):
1987 mov -42(%eax), %ecx
1988 mov -42(%edx), %ebx
1989 cmp %ebx, %ecx
1990 jne L(find_diff)
1991L(38bytes):
1992 mov -38(%eax), %ecx
1993 mov -38(%edx), %ebx
1994 cmp %ebx, %ecx
1995 jne L(find_diff)
1996L(34bytes):
1997 mov -34(%eax), %ecx
1998 mov -34(%edx), %ebx
1999 cmp %ebx, %ecx
2000 jne L(find_diff)
2001L(30bytes):
2002 mov -30(%eax), %ecx
2003 mov -30(%edx), %ebx
2004 cmp %ebx, %ecx
2005 jne L(find_diff)
2006L(26bytes):
2007 mov -26(%eax), %ecx
2008 mov -26(%edx), %ebx
2009 cmp %ebx, %ecx
2010 jne L(find_diff)
2011L(22bytes):
2012 mov -22(%eax), %ecx
2013 mov -22(%edx), %ebx
2014 cmp %ebx, %ecx
2015 jne L(find_diff)
2016L(18bytes):
2017 mov -18(%eax), %ecx
2018 mov -18(%edx), %ebx
2019 cmp %ebx, %ecx
2020 jne L(find_diff)
2021L(14bytes):
2022 mov -14(%eax), %ecx
2023 mov -14(%edx), %ebx
2024 cmp %ebx, %ecx
2025 jne L(find_diff)
2026L(10bytes):
2027 mov -10(%eax), %ecx
2028 mov -10(%edx), %ebx
2029 cmp %ebx, %ecx
2030 jne L(find_diff)
2031L(6bytes):
2032 mov -6(%eax), %ecx
2033 mov -6(%edx), %ebx
2034 cmp %ebx, %ecx
2035 jne L(find_diff)
2036L(2bytes):
/* Load the last two bytes of each side as one 16-bit value.  The byte at
   the lower address lands in %cl/%bl and is compared first, then the byte
   at the higher address in %ch/%bh.  */
2037 movzwl -2(%eax), %ecx
2038 movzwl -2(%edx), %ebx
2039 cmp %bl, %cl
2040 jne L(end)
2041 cmp %bh, %ch
/* Result 0 for the all-equal case; mov keeps the flags for the jne.  */
2042 mov $0, %eax
2043 jne L(end)
2044 POP (%ebx)
2045 ret
/* CFI only: the code that follows again runs with %ebx on the stack.  */
2046 CFI_PUSH (%ebx)
2047
2048 .p2align 4
/* memcmp chain for counts of the form 4k + 3.
   %eax and %edx are the end pointers of the two buffers.  Entry L(Nbytes)
   compares the 32-bit word at offset -N on both sides and falls through to
   the next lower entry; a mismatch goes to L(find_diff) with the words in
   %ecx (from the %eax side) and %ebx (from the %edx side).  L(3bytes)
   handles the last three bytes.  */
2049L(47bytes):
2050 movl -47(%eax), %ecx
2051 movl -47(%edx), %ebx
2052 cmp %ebx, %ecx
2053 jne L(find_diff)
2054L(43bytes):
2055 movl -43(%eax), %ecx
2056 movl -43(%edx), %ebx
2057 cmp %ebx, %ecx
2058 jne L(find_diff)
2059L(39bytes):
2060 movl -39(%eax), %ecx
2061 movl -39(%edx), %ebx
2062 cmp %ebx, %ecx
2063 jne L(find_diff)
2064L(35bytes):
2065 movl -35(%eax), %ecx
2066 movl -35(%edx), %ebx
2067 cmp %ebx, %ecx
2068 jne L(find_diff)
2069L(31bytes):
2070 movl -31(%eax), %ecx
2071 movl -31(%edx), %ebx
2072 cmp %ebx, %ecx
2073 jne L(find_diff)
2074L(27bytes):
2075 movl -27(%eax), %ecx
2076 movl -27(%edx), %ebx
2077 cmp %ebx, %ecx
2078 jne L(find_diff)
2079L(23bytes):
2080 movl -23(%eax), %ecx
2081 movl -23(%edx), %ebx
2082 cmp %ebx, %ecx
2083 jne L(find_diff)
2084L(19bytes):
2085 movl -19(%eax), %ecx
2086 movl -19(%edx), %ebx
2087 cmp %ebx, %ecx
2088 jne L(find_diff)
2089L(15bytes):
2090 movl -15(%eax), %ecx
2091 movl -15(%edx), %ebx
2092 cmp %ebx, %ecx
2093 jne L(find_diff)
2094L(11bytes):
2095 movl -11(%eax), %ecx
2096 movl -11(%edx), %ebx
2097 cmp %ebx, %ecx
2098 jne L(find_diff)
2099L(7bytes):
2100 movl -7(%eax), %ecx
2101 movl -7(%edx), %ebx
2102 cmp %ebx, %ecx
2103 jne L(find_diff)
2104L(3bytes):
/* Bytes at -3 and -2 are loaded as one 16-bit value.  The low byte (lower
   address) is compared first; once it is known equal, the 16-bit compare
   is decided by the second byte.  */
2105 movzwl -3(%eax), %ecx
2106 movzwl -3(%edx), %ebx
2107 cmpb %bl, %cl
2108 jne L(end)
2109 cmp %bx, %cx
2110 jne L(end)
/* Last byte.  %eax is no longer needed as a pointer after this load; the
   mov then sets the all-equal result 0 without touching the flags.  */
2111 movzbl -1(%eax), %eax
2112 cmpb -1(%edx), %al
2113 mov $0, %eax
2114 jne L(end)
2115 POP (%ebx)
2116 ret
/* CFI only: the code that follows again runs with %ebx on the stack.  */
2117 CFI_PUSH (%ebx)
2118
2119 .p2align 4
/* L(find_diff) (memcmp): %ecx and %ebx hold two unequal 32-bit words,
   %ecx loaded from the %eax side and %ebx from the %edx side.  Find the
   differing byte at the lowest address (the least significant one, as the
   words were loaded with plain 32-bit moves) and leave the flags of that
   byte comparison for L(end).  */
2120L(find_diff):
2121 cmpb %bl, %cl
2122 jne L(end)
/* Low bytes are equal here, so this 16-bit compare is decided by byte 1.  */
2123 cmp %bx, %cx
2124 jne L(end)
/* Bring bytes 2 and 3 down and repeat.  */
2125 shr $16,%ecx
2126 shr $16,%ebx
2127 cmp %bl, %cl
2128 jne L(end)
/* Only byte 3 is left; execution continues into L(end) with the flags of
   this compare (this relies on the alignment padding leaving flags alone).  */
2129 cmp %bx, %cx
2130
2131 .p2align 4
/* L(end): turn the flags of an unsigned byte compare into the result.
   popl and mov do not modify the flags.  Returns 1 if the %eax-side byte
   is above the %edx-side byte, otherwise -1.  */
2132L(end):
2133 POP (%ebx)
2134 mov $1, %eax
2135 ja L(bigger)
2136 neg %eax
2137L(bigger):
2138 ret
2139# else
2140
2141/* for wmemcmp */
/* L(find_diff) (wmemcmp): entered with the flags of a cmp between the
   %eax-side element (in %ecx) and the %edx-side element (in memory).
   popl and mov leave those flags intact.  jg is the signed test: return 1
   if the %eax-side element is greater, otherwise -1.  */
2142 .p2align 4
2143L(find_diff):
2144 POP (%ebx)
2145 mov $1, %eax
2146 jg L(find_diff_bigger)
2147 neg %eax
2148 ret
2149
2150 .p2align 4
/* Returns the 1 already in %eax.  */
2151L(find_diff_bigger):
2152 ret
2153
2154# endif
2155END (MEMCMP)
2156#endif
2157

/* Source: glibc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S */