1/* memcmp with SSSE3, wmemcmp with SSSE3
2 Copyright (C) 2011-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
23# ifndef MEMCMP
24# define MEMCMP __memcmp_ssse3
25# endif
26
27/* Warning!
28 wmemcmp has to use SIGNED comparison for elements.
29 memcmp has to use UNSIGNED comparison for elements.
30*/
31
32 atom_text_section
/* Entry point of memcmp (wmemcmp when USE_AS_WMEMCMP is defined).
   In:  RDI = first buffer, RSI = second buffer, RDX = length
        (an element count in the wmemcmp build).
   Lengths below 48 bytes go straight to the tail dispatcher with both
   pointers moved to the END of their buffers; everything else is
   handled by L(48bytesormore).  */
33ENTRY (MEMCMP)
34# ifdef USE_AS_WMEMCMP
35 shl $2, %RDX_LP /* scale the element count by 4 to get bytes */
36 test %RDX_LP, %RDX_LP
37 jz L(equal) /* zero length; L(equal) is defined outside this excerpt */
38# elif defined __ILP32__
39 /* Clear the upper 32 bits. */
40 mov %edx, %edx
41# endif
42 mov %rdx, %rcx /* RCX carries the byte count from here on */
43 mov %rdi, %rdx
44 cmp $48, %rcx;
45 jae L(48bytesormore) /* LEN >= 48 */
46
47 add %rcx, %rsi /* point past the end: the tail code addresses the */
48 add %rcx, %rdi /* data with negative displacements */
49 jmp L(less48bytes)
50
51 .p2align 4
52/* RCX >= 48 here.  Compare the first 16 bytes with unaligned loads,
   then round RDI down to a 16-byte boundary and pick the L(shr_N)
   block that matches the misalignment N of RSI.  RCX is NOT reduced
   for the 16 bytes compared here; the L(shr_N) blocks subtract 48
   (these 16 plus their own 32) instead.  */
53L(48bytesormore):
54 movdqu (%rdi), %xmm3
55 movdqu (%rsi), %xmm0
56 pcmpeqb %xmm0, %xmm3
57 pmovmskb %xmm3, %edx /* one bit per byte, set where the bytes are equal */
58 lea 16(%rdi), %rdi
59 lea 16(%rsi), %rsi
60 sub $0xffff, %edx /* zero iff all 16 bytes matched */
61 jnz L(less16bytes)
62 mov %edi, %edx
63 and $0xf, %edx /* EDX = RDI mod 16 */
64 xor %rdx, %rdi /* clears the low 4 bits: RDI is now 16-byte aligned */
65 sub %rdx, %rsi /* move RSI back by the same amount ... */
66 add %rdx, %rcx /* ... and add the re-compared bytes to the count */
67 mov %esi, %edx
68 and $0xf, %edx /* EDX = RSI mod 16 */
69 jz L(shr_0)
70 xor %rdx, %rsi /* round RSI down as well; EDX keeps the offset */
71
72# ifndef USE_AS_WMEMCMP
73 cmp $8, %edx
74 jae L(next_unaligned_table)
75 cmp $0, %edx /* NOTE(review): EDX is non-zero here (see the jz above), */
76 je L(shr_0) /* so this branch can never be taken */
77 cmp $1, %edx
78 je L(shr_1)
79 cmp $2, %edx
80 je L(shr_2)
81 cmp $3, %edx
82 je L(shr_3)
83 cmp $4, %edx
84 je L(shr_4)
85 cmp $5, %edx
86 je L(shr_5)
87 cmp $6, %edx
88 je L(shr_6)
89 jmp L(shr_7)
90
91 .p2align 2
92L(next_unaligned_table):
93 cmp $8, %edx
94 je L(shr_8)
95 cmp $9, %edx
96 je L(shr_9)
97 cmp $10, %edx
98 je L(shr_10)
99 cmp $11, %edx
100 je L(shr_11)
101 cmp $12, %edx
102 je L(shr_12)
103 cmp $13, %edx
104 je L(shr_13)
105 cmp $14, %edx
106 je L(shr_14)
107 jmp L(shr_15)
108# else
 /* wmemcmp build: only offsets 4 and 8 are tested explicitly; every
    other non-zero offset is sent to L(shr_12).  */
109 cmp $0, %edx /* NOTE(review): unreachable for the same reason as above */
110 je L(shr_0)
111 cmp $4, %edx
112 je L(shr_4)
113 cmp $8, %edx
114 je L(shr_8)
115 jmp L(shr_12)
116# endif
117
118 .p2align 4
/* Source offset 0: RDI and RSI are both 16-byte aligned.  On entry RCX
   is the number of bytes left at RDI plus 16 (L(48bytesormore) did not
   subtract the 16 bytes it compared).  Compares one 32-byte pair, or
   hands over to the loop when at least 64 bytes are left.  */
119L(shr_0):
120 cmp $80, %rcx
121 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
122 jae L(shr_0_gobble)
123 xor %eax, %eax /* RAX = source offset (0); L(exit) adds it to RSI */
124 movdqa (%rsi), %xmm1
125 pcmpeqb (%rdi), %xmm1 /* XMM1 = result for the first 16 bytes, read by L(exit) */
126 movdqa 16(%rsi), %xmm2
127 pcmpeqb 16(%rdi), %xmm2
128 pand %xmm1, %xmm2 /* combine both halves */
129 pmovmskb %xmm2, %edx
130 lea 32(%rdi), %rdi
131 lea 32(%rsi), %rsi
132 sub $0xffff, %edx /* zero iff all 32 bytes matched */
133 jnz L(exit)
134 add %rcx, %rsi /* RCX = bytes still uncompared; point past the */
135 add %rcx, %rdi /* end of both buffers for the tail code */
136 jmp L(less48bytes)
137
138 .p2align 4
/* Loop variant of L(shr_0), entered when RCX was at least 80.  Each
   iteration checks the 32-byte pair compared on the previous pass while
   already comparing the next one.  XMM1 always keeps the result for the
   first 16 bytes of the pair being checked, because L(exit) reads it.  */
139L(shr_0_gobble):
140 movdqa (%rsi), %xmm0
141 xor %eax, %eax /* RAX = source offset (0); L(exit) adds it to RSI */
142 pcmpeqb (%rdi), %xmm0
143 sub $32, %rcx
144 movdqa 16(%rsi), %xmm2
145 pcmpeqb 16(%rdi), %xmm2
146L(shr_0_gobble_loop):
147 pand %xmm0, %xmm2
148 sub $32, %rcx /* CF is set if RCX was below 32 */
149 pmovmskb %xmm2, %edx
150 movdqa %xmm0, %xmm1
151 movdqa 32(%rsi), %xmm0
152 movdqa 48(%rsi), %xmm2
153 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
154 pcmpeqb 32(%rdi), %xmm0
155 pcmpeqb 48(%rdi), %xmm2
156 lea 32(%rdi), %rdi
157 lea 32(%rsi), %rsi
158 jz L(shr_0_gobble_loop) /* flags are still those of the sbb */
159
160 pand %xmm0, %xmm2
161 cmp $0, %rcx
162 jge L(next)
163 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
164 add $32, %rcx
165L(next):
166 test %edx, %edx
167 jnz L(exit) /* the pair checked in the loop differs */
168
 /* That pair matched; now check the pair that was compared ahead.  */
169 pmovmskb %xmm2, %edx
170 movdqa %xmm0, %xmm1
171 lea 32(%rdi), %rdi
172 lea 32(%rsi), %rsi
173 sub $0xffff, %edx
174 jnz L(exit)
175 add %rcx, %rsi /* point past the end of both buffers for the tail code */
176 add %rcx, %rdi
177 jmp L(less48bytes)
178
179# ifndef USE_AS_WMEMCMP
180
181 .p2align 4
/* Source offset 1: RSI was rounded down to a 16-byte boundary and EDX
   holds the 1 byte that was dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $1 before it is compared
   with the aligned data at RDI.  */
182L(shr_1):
183 cmp $80, %rcx
184 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
185 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
186 jae L(shr_1_gobble)
187
188 movdqa 16(%rsi), %xmm1
189 movdqa %xmm1, %xmm2
190 palignr $1, (%rsi), %xmm1
191 pcmpeqb (%rdi), %xmm1 /* XMM1 = result for the first 16 bytes, read by L(exit) */
192
193 movdqa 32(%rsi), %xmm3
194 palignr $1, %xmm2, %xmm3
195 pcmpeqb 16(%rdi), %xmm3
196
197 pand %xmm1, %xmm3
198 pmovmskb %xmm3, %edx
199 lea 32(%rdi), %rdi
200 lea 32(%rsi), %rsi
201 sub $0xffff, %edx /* zero iff all 32 bytes matched */
202 jnz L(exit)
203 add $1, %rsi /* undo the rounding of RSI */
204 add %rcx, %rsi /* point past the end of both buffers for the tail code */
205 add %rcx, %rdi
206 jmp L(less48bytes)
207
208 .p2align 4
/* Loop variant of L(shr_1), entered when RCX was at least 80.  */
209L(shr_1_gobble):
210 sub $32, %rcx
211 movdqa 16(%rsi), %xmm0
212 palignr $1, (%rsi), %xmm0
213 pcmpeqb (%rdi), %xmm0
214
215 movdqa 32(%rsi), %xmm3
216 palignr $1, 16(%rsi), %xmm3
217 pcmpeqb 16(%rdi), %xmm3
218
219L(shr_1_gobble_loop):
220 pand %xmm0, %xmm3
221 sub $32, %rcx /* CF is set if RCX was below 32 */
222 pmovmskb %xmm3, %edx
223 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
224
225 movdqa 64(%rsi), %xmm3
226 palignr $1, 48(%rsi), %xmm3
227 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
228 movdqa 48(%rsi), %xmm0
229 palignr $1, 32(%rsi), %xmm0
230 pcmpeqb 32(%rdi), %xmm0
231 lea 32(%rsi), %rsi
232 pcmpeqb 48(%rdi), %xmm3
233
234 lea 32(%rdi), %rdi
235 jz L(shr_1_gobble_loop) /* flags are still those of the sbb */
236 pand %xmm0, %xmm3
237
238 cmp $0, %rcx
239 jge L(shr_1_gobble_next)
240 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
241 add $32, %rcx
242L(shr_1_gobble_next):
243 test %edx, %edx
244 jnz L(exit)
245
246 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
247 movdqa %xmm0, %xmm1
248 lea 32(%rdi), %rdi
249 lea 32(%rsi), %rsi
250 sub $0xffff, %edx
251 jnz L(exit)
252
253 lea 1(%rsi), %rsi /* undo the rounding of RSI */
254 add %rcx, %rsi
255 add %rcx, %rdi
256 jmp L(less48bytes)
257
258
259 .p2align 4
/* Source offset 2: RSI was rounded down to a 16-byte boundary and EDX
   holds the 2 bytes that were dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $2 before it is compared
   with the aligned data at RDI.  */
260L(shr_2):
261 cmp $80, %rcx
262 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
263 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
264 jae L(shr_2_gobble)
265
266 movdqa 16(%rsi), %xmm1
267 movdqa %xmm1, %xmm2
268 palignr $2, (%rsi), %xmm1
269 pcmpeqb (%rdi), %xmm1
270
271 movdqa 32(%rsi), %xmm3
272 palignr $2, %xmm2, %xmm3
273 pcmpeqb 16(%rdi), %xmm3
274
275 pand %xmm1, %xmm3
276 pmovmskb %xmm3, %edx
277 lea 32(%rdi), %rdi
278 lea 32(%rsi), %rsi
279 sub $0xffff, %edx /* zero iff all 32 bytes matched */
280 jnz L(exit)
281 add $2, %rsi /* undo the rounding of RSI */
282 add %rcx, %rsi
283 add %rcx, %rdi
284 jmp L(less48bytes)
285
286 .p2align 4
/* Loop variant of L(shr_2), entered when RCX was at least 80.  */
287L(shr_2_gobble):
288 sub $32, %rcx
289 movdqa 16(%rsi), %xmm0
290 palignr $2, (%rsi), %xmm0
291 pcmpeqb (%rdi), %xmm0
292
293 movdqa 32(%rsi), %xmm3
294 palignr $2, 16(%rsi), %xmm3
295 pcmpeqb 16(%rdi), %xmm3
296
297L(shr_2_gobble_loop):
298 pand %xmm0, %xmm3
299 sub $32, %rcx /* CF is set if RCX was below 32 */
300 pmovmskb %xmm3, %edx
301 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
302
303 movdqa 64(%rsi), %xmm3
304 palignr $2, 48(%rsi), %xmm3
305 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
306 movdqa 48(%rsi), %xmm0
307 palignr $2, 32(%rsi), %xmm0
308 pcmpeqb 32(%rdi), %xmm0
309 lea 32(%rsi), %rsi
310 pcmpeqb 48(%rdi), %xmm3
311
312 lea 32(%rdi), %rdi
313 jz L(shr_2_gobble_loop) /* flags are still those of the sbb */
314 pand %xmm0, %xmm3
315
316 cmp $0, %rcx
317 jge L(shr_2_gobble_next)
318 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
319 add $32, %rcx
320L(shr_2_gobble_next):
321 test %edx, %edx
322 jnz L(exit)
323
324 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
325 movdqa %xmm0, %xmm1
326 lea 32(%rdi), %rdi
327 lea 32(%rsi), %rsi
328 sub $0xffff, %edx
329 jnz L(exit)
330
331 lea 2(%rsi), %rsi /* undo the rounding of RSI */
332 add %rcx, %rsi
333 add %rcx, %rdi
334 jmp L(less48bytes)
335
336 .p2align 4
/* Source offset 3: RSI was rounded down to a 16-byte boundary and EDX
   holds the 3 bytes that were dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $3 before it is compared
   with the aligned data at RDI.  */
337L(shr_3):
338 cmp $80, %rcx
339 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
340 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
341 jae L(shr_3_gobble)
342
343 movdqa 16(%rsi), %xmm1
344 movdqa %xmm1, %xmm2
345 palignr $3, (%rsi), %xmm1
346 pcmpeqb (%rdi), %xmm1
347
348 movdqa 32(%rsi), %xmm3
349 palignr $3, %xmm2, %xmm3
350 pcmpeqb 16(%rdi), %xmm3
351
352 pand %xmm1, %xmm3
353 pmovmskb %xmm3, %edx
354 lea 32(%rdi), %rdi
355 lea 32(%rsi), %rsi
356 sub $0xffff, %edx /* zero iff all 32 bytes matched */
357 jnz L(exit)
358 add $3, %rsi /* undo the rounding of RSI */
359 add %rcx, %rsi
360 add %rcx, %rdi
361 jmp L(less48bytes)
362
363 .p2align 4
/* Loop variant of L(shr_3), entered when RCX was at least 80.  */
364L(shr_3_gobble):
365 sub $32, %rcx
366 movdqa 16(%rsi), %xmm0
367 palignr $3, (%rsi), %xmm0
368 pcmpeqb (%rdi), %xmm0
369
370 movdqa 32(%rsi), %xmm3
371 palignr $3, 16(%rsi), %xmm3
372 pcmpeqb 16(%rdi), %xmm3
373
374L(shr_3_gobble_loop):
375 pand %xmm0, %xmm3
376 sub $32, %rcx /* CF is set if RCX was below 32 */
377 pmovmskb %xmm3, %edx
378 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
379
380 movdqa 64(%rsi), %xmm3
381 palignr $3, 48(%rsi), %xmm3
382 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
383 movdqa 48(%rsi), %xmm0
384 palignr $3, 32(%rsi), %xmm0
385 pcmpeqb 32(%rdi), %xmm0
386 lea 32(%rsi), %rsi
387 pcmpeqb 48(%rdi), %xmm3
388
389 lea 32(%rdi), %rdi
390 jz L(shr_3_gobble_loop) /* flags are still those of the sbb */
391 pand %xmm0, %xmm3
392
393 cmp $0, %rcx
394 jge L(shr_3_gobble_next)
395 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
396 add $32, %rcx
397L(shr_3_gobble_next):
398 test %edx, %edx
399 jnz L(exit)
400
401 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
402 movdqa %xmm0, %xmm1
403 lea 32(%rdi), %rdi
404 lea 32(%rsi), %rsi
405 sub $0xffff, %edx
406 jnz L(exit)
407
408 lea 3(%rsi), %rsi /* undo the rounding of RSI */
409 add %rcx, %rsi
410 add %rcx, %rdi
411 jmp L(less48bytes)
412
413# endif
414
415 .p2align 4
/* Source offset 4 (built for both memcmp and wmemcmp): RSI was rounded
   down to a 16-byte boundary and EDX holds the 4 bytes that were
   dropped.  Every 16-byte source vector is rebuilt from two aligned
   loads with palignr $4 before it is compared with the aligned data at
   RDI.  */
416L(shr_4):
417 cmp $80, %rcx
418 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
419 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
420 jae L(shr_4_gobble)
421
422 movdqa 16(%rsi), %xmm1
423 movdqa %xmm1, %xmm2
424 palignr $4, (%rsi), %xmm1
425 pcmpeqb (%rdi), %xmm1
426
427 movdqa 32(%rsi), %xmm3
428 palignr $4, %xmm2, %xmm3
429 pcmpeqb 16(%rdi), %xmm3
430
431 pand %xmm1, %xmm3
432 pmovmskb %xmm3, %edx
433 lea 32(%rdi), %rdi
434 lea 32(%rsi), %rsi
435 sub $0xffff, %edx /* zero iff all 32 bytes matched */
436 jnz L(exit)
437 add $4, %rsi /* undo the rounding of RSI */
438 add %rcx, %rsi
439 add %rcx, %rdi
440 jmp L(less48bytes)
441
442 .p2align 4
/* Loop variant of L(shr_4), entered when RCX was at least 80.  */
443L(shr_4_gobble):
444 sub $32, %rcx
445 movdqa 16(%rsi), %xmm0
446 palignr $4, (%rsi), %xmm0
447 pcmpeqb (%rdi), %xmm0
448
449 movdqa 32(%rsi), %xmm3
450 palignr $4, 16(%rsi), %xmm3
451 pcmpeqb 16(%rdi), %xmm3
452
453L(shr_4_gobble_loop):
454 pand %xmm0, %xmm3
455 sub $32, %rcx /* CF is set if RCX was below 32 */
456 pmovmskb %xmm3, %edx
457 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
458
459 movdqa 64(%rsi), %xmm3
460 palignr $4, 48(%rsi), %xmm3
461 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
462 movdqa 48(%rsi), %xmm0
463 palignr $4, 32(%rsi), %xmm0
464 pcmpeqb 32(%rdi), %xmm0
465 lea 32(%rsi), %rsi
466 pcmpeqb 48(%rdi), %xmm3
467
468 lea 32(%rdi), %rdi
469 jz L(shr_4_gobble_loop) /* flags are still those of the sbb */
470 pand %xmm0, %xmm3
471
472 cmp $0, %rcx
473 jge L(shr_4_gobble_next)
474 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
475 add $32, %rcx
476L(shr_4_gobble_next):
477 test %edx, %edx
478 jnz L(exit)
479
480 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
481 movdqa %xmm0, %xmm1
482 lea 32(%rdi), %rdi
483 lea 32(%rsi), %rsi
484 sub $0xffff, %edx
485 jnz L(exit)
486
487 lea 4(%rsi), %rsi /* undo the rounding of RSI */
488 add %rcx, %rsi
489 add %rcx, %rdi
490 jmp L(less48bytes)
491
492# ifndef USE_AS_WMEMCMP
493
494 .p2align 4
/* Source offset 5: RSI was rounded down to a 16-byte boundary and EDX
   holds the 5 bytes that were dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $5 before it is compared
   with the aligned data at RDI.  */
495L(shr_5):
496 cmp $80, %rcx
497 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
498 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
499 jae L(shr_5_gobble)
500
501 movdqa 16(%rsi), %xmm1
502 movdqa %xmm1, %xmm2
503 palignr $5, (%rsi), %xmm1
504 pcmpeqb (%rdi), %xmm1
505
506 movdqa 32(%rsi), %xmm3
507 palignr $5, %xmm2, %xmm3
508 pcmpeqb 16(%rdi), %xmm3
509
510 pand %xmm1, %xmm3
511 pmovmskb %xmm3, %edx
512 lea 32(%rdi), %rdi
513 lea 32(%rsi), %rsi
514 sub $0xffff, %edx /* zero iff all 32 bytes matched */
515 jnz L(exit)
516 add $5, %rsi /* undo the rounding of RSI */
517 add %rcx, %rsi
518 add %rcx, %rdi
519 jmp L(less48bytes)
520
521 .p2align 4
/* Loop variant of L(shr_5), entered when RCX was at least 80.  */
522L(shr_5_gobble):
523 sub $32, %rcx
524 movdqa 16(%rsi), %xmm0
525 palignr $5, (%rsi), %xmm0
526 pcmpeqb (%rdi), %xmm0
527
528 movdqa 32(%rsi), %xmm3
529 palignr $5, 16(%rsi), %xmm3
530 pcmpeqb 16(%rdi), %xmm3
531
532L(shr_5_gobble_loop):
533 pand %xmm0, %xmm3
534 sub $32, %rcx /* CF is set if RCX was below 32 */
535 pmovmskb %xmm3, %edx
536 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
537
538 movdqa 64(%rsi), %xmm3
539 palignr $5, 48(%rsi), %xmm3
540 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
541 movdqa 48(%rsi), %xmm0
542 palignr $5, 32(%rsi), %xmm0
543 pcmpeqb 32(%rdi), %xmm0
544 lea 32(%rsi), %rsi
545 pcmpeqb 48(%rdi), %xmm3
546
547 lea 32(%rdi), %rdi
548 jz L(shr_5_gobble_loop) /* flags are still those of the sbb */
549 pand %xmm0, %xmm3
550
551 cmp $0, %rcx
552 jge L(shr_5_gobble_next)
553 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
554 add $32, %rcx
555L(shr_5_gobble_next):
556 test %edx, %edx
557 jnz L(exit)
558
559 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
560 movdqa %xmm0, %xmm1
561 lea 32(%rdi), %rdi
562 lea 32(%rsi), %rsi
563 sub $0xffff, %edx
564 jnz L(exit)
565
566 lea 5(%rsi), %rsi /* undo the rounding of RSI */
567 add %rcx, %rsi
568 add %rcx, %rdi
569 jmp L(less48bytes)
570
571 .p2align 4
/* Source offset 6: RSI was rounded down to a 16-byte boundary and EDX
   holds the 6 bytes that were dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $6 before it is compared
   with the aligned data at RDI.  */
572L(shr_6):
573 cmp $80, %rcx
574 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
575 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
576 jae L(shr_6_gobble)
577
578 movdqa 16(%rsi), %xmm1
579 movdqa %xmm1, %xmm2
580 palignr $6, (%rsi), %xmm1
581 pcmpeqb (%rdi), %xmm1
582
583 movdqa 32(%rsi), %xmm3
584 palignr $6, %xmm2, %xmm3
585 pcmpeqb 16(%rdi), %xmm3
586
587 pand %xmm1, %xmm3
588 pmovmskb %xmm3, %edx
589 lea 32(%rdi), %rdi
590 lea 32(%rsi), %rsi
591 sub $0xffff, %edx /* zero iff all 32 bytes matched */
592 jnz L(exit)
593 add $6, %rsi /* undo the rounding of RSI */
594 add %rcx, %rsi
595 add %rcx, %rdi
596 jmp L(less48bytes)
597
598 .p2align 4
/* Loop variant of L(shr_6), entered when RCX was at least 80.  */
599L(shr_6_gobble):
600 sub $32, %rcx
601 movdqa 16(%rsi), %xmm0
602 palignr $6, (%rsi), %xmm0
603 pcmpeqb (%rdi), %xmm0
604
605 movdqa 32(%rsi), %xmm3
606 palignr $6, 16(%rsi), %xmm3
607 pcmpeqb 16(%rdi), %xmm3
608
609L(shr_6_gobble_loop):
610 pand %xmm0, %xmm3
611 sub $32, %rcx /* CF is set if RCX was below 32 */
612 pmovmskb %xmm3, %edx
613 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
614
615 movdqa 64(%rsi), %xmm3
616 palignr $6, 48(%rsi), %xmm3
617 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
618 movdqa 48(%rsi), %xmm0
619 palignr $6, 32(%rsi), %xmm0
620 pcmpeqb 32(%rdi), %xmm0
621 lea 32(%rsi), %rsi
622 pcmpeqb 48(%rdi), %xmm3
623
624 lea 32(%rdi), %rdi
625 jz L(shr_6_gobble_loop) /* flags are still those of the sbb */
626 pand %xmm0, %xmm3
627
628 cmp $0, %rcx
629 jge L(shr_6_gobble_next)
630 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
631 add $32, %rcx
632L(shr_6_gobble_next):
633 test %edx, %edx
634 jnz L(exit)
635
636 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
637 movdqa %xmm0, %xmm1
638 lea 32(%rdi), %rdi
639 lea 32(%rsi), %rsi
640 sub $0xffff, %edx
641 jnz L(exit)
642
643 lea 6(%rsi), %rsi /* undo the rounding of RSI */
644 add %rcx, %rsi
645 add %rcx, %rdi
646 jmp L(less48bytes)
647
648 .p2align 4
/* Source offset 7: RSI was rounded down to a 16-byte boundary and EDX
   holds the 7 bytes that were dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $7 before it is compared
   with the aligned data at RDI.  */
649L(shr_7):
650 cmp $80, %rcx
651 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
652 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
653 jae L(shr_7_gobble)
654
655 movdqa 16(%rsi), %xmm1
656 movdqa %xmm1, %xmm2
657 palignr $7, (%rsi), %xmm1
658 pcmpeqb (%rdi), %xmm1
659
660 movdqa 32(%rsi), %xmm3
661 palignr $7, %xmm2, %xmm3
662 pcmpeqb 16(%rdi), %xmm3
663
664 pand %xmm1, %xmm3
665 pmovmskb %xmm3, %edx
666 lea 32(%rdi), %rdi
667 lea 32(%rsi), %rsi
668 sub $0xffff, %edx /* zero iff all 32 bytes matched */
669 jnz L(exit)
670 add $7, %rsi /* undo the rounding of RSI */
671 add %rcx, %rsi
672 add %rcx, %rdi
673 jmp L(less48bytes)
674
675 .p2align 4
/* Loop variant of L(shr_7), entered when RCX was at least 80.  */
676L(shr_7_gobble):
677 sub $32, %rcx
678 movdqa 16(%rsi), %xmm0
679 palignr $7, (%rsi), %xmm0
680 pcmpeqb (%rdi), %xmm0
681
682 movdqa 32(%rsi), %xmm3
683 palignr $7, 16(%rsi), %xmm3
684 pcmpeqb 16(%rdi), %xmm3
685
686L(shr_7_gobble_loop):
687 pand %xmm0, %xmm3
688 sub $32, %rcx /* CF is set if RCX was below 32 */
689 pmovmskb %xmm3, %edx
690 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
691
692 movdqa 64(%rsi), %xmm3
693 palignr $7, 48(%rsi), %xmm3
694 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
695 movdqa 48(%rsi), %xmm0
696 palignr $7, 32(%rsi), %xmm0
697 pcmpeqb 32(%rdi), %xmm0
698 lea 32(%rsi), %rsi
699 pcmpeqb 48(%rdi), %xmm3
700
701 lea 32(%rdi), %rdi
702 jz L(shr_7_gobble_loop) /* flags are still those of the sbb */
703 pand %xmm0, %xmm3
704
705 cmp $0, %rcx
706 jge L(shr_7_gobble_next)
707 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
708 add $32, %rcx
709L(shr_7_gobble_next):
710 test %edx, %edx
711 jnz L(exit)
712
713 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
714 movdqa %xmm0, %xmm1
715 lea 32(%rdi), %rdi
716 lea 32(%rsi), %rsi
717 sub $0xffff, %edx
718 jnz L(exit)
719
720 lea 7(%rsi), %rsi /* undo the rounding of RSI */
721 add %rcx, %rsi
722 add %rcx, %rdi
723 jmp L(less48bytes)
724
725# endif
726
727 .p2align 4
/* Source offset 8 (built for both memcmp and wmemcmp): RSI was rounded
   down to a 16-byte boundary and EDX holds the 8 bytes that were
   dropped.  Every 16-byte source vector is rebuilt from two aligned
   loads with palignr $8 before it is compared with the aligned data at
   RDI.  */
728L(shr_8):
729 cmp $80, %rcx
730 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
731 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
732 jae L(shr_8_gobble)
733
734 movdqa 16(%rsi), %xmm1
735 movdqa %xmm1, %xmm2
736 palignr $8, (%rsi), %xmm1
737 pcmpeqb (%rdi), %xmm1
738
739 movdqa 32(%rsi), %xmm3
740 palignr $8, %xmm2, %xmm3
741 pcmpeqb 16(%rdi), %xmm3
742
743 pand %xmm1, %xmm3
744 pmovmskb %xmm3, %edx
745 lea 32(%rdi), %rdi
746 lea 32(%rsi), %rsi
747 sub $0xffff, %edx /* zero iff all 32 bytes matched */
748 jnz L(exit)
749 add $8, %rsi /* undo the rounding of RSI */
750 add %rcx, %rsi
751 add %rcx, %rdi
752 jmp L(less48bytes)
753
754 .p2align 4
/* Loop variant of L(shr_8), entered when RCX was at least 80.  */
755L(shr_8_gobble):
756 sub $32, %rcx
757 movdqa 16(%rsi), %xmm0
758 palignr $8, (%rsi), %xmm0
759 pcmpeqb (%rdi), %xmm0
760
761 movdqa 32(%rsi), %xmm3
762 palignr $8, 16(%rsi), %xmm3
763 pcmpeqb 16(%rdi), %xmm3
764
765L(shr_8_gobble_loop):
766 pand %xmm0, %xmm3
767 sub $32, %rcx /* CF is set if RCX was below 32 */
768 pmovmskb %xmm3, %edx
769 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
770
771 movdqa 64(%rsi), %xmm3
772 palignr $8, 48(%rsi), %xmm3
773 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
774 movdqa 48(%rsi), %xmm0
775 palignr $8, 32(%rsi), %xmm0
776 pcmpeqb 32(%rdi), %xmm0
777 lea 32(%rsi), %rsi
778 pcmpeqb 48(%rdi), %xmm3
779
780 lea 32(%rdi), %rdi
781 jz L(shr_8_gobble_loop) /* flags are still those of the sbb */
782 pand %xmm0, %xmm3
783
784 cmp $0, %rcx
785 jge L(shr_8_gobble_next)
786 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
787 add $32, %rcx
788L(shr_8_gobble_next):
789 test %edx, %edx
790 jnz L(exit)
791
792 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
793 movdqa %xmm0, %xmm1
794 lea 32(%rdi), %rdi
795 lea 32(%rsi), %rsi
796 sub $0xffff, %edx
797 jnz L(exit)
798
799 lea 8(%rsi), %rsi /* undo the rounding of RSI */
800 add %rcx, %rsi
801 add %rcx, %rdi
802 jmp L(less48bytes)
803
804# ifndef USE_AS_WMEMCMP
805
806 .p2align 4
/* Source offset 9: RSI was rounded down to a 16-byte boundary and EDX
   holds the 9 bytes that were dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $9 before it is compared
   with the aligned data at RDI.  */
807L(shr_9):
808 cmp $80, %rcx
809 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
810 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
811 jae L(shr_9_gobble)
812
813 movdqa 16(%rsi), %xmm1
814 movdqa %xmm1, %xmm2
815 palignr $9, (%rsi), %xmm1
816 pcmpeqb (%rdi), %xmm1
817
818 movdqa 32(%rsi), %xmm3
819 palignr $9, %xmm2, %xmm3
820 pcmpeqb 16(%rdi), %xmm3
821
822 pand %xmm1, %xmm3
823 pmovmskb %xmm3, %edx
824 lea 32(%rdi), %rdi
825 lea 32(%rsi), %rsi
826 sub $0xffff, %edx /* zero iff all 32 bytes matched */
827 jnz L(exit)
828 add $9, %rsi /* undo the rounding of RSI */
829 add %rcx, %rsi
830 add %rcx, %rdi
831 jmp L(less48bytes)
832
833 .p2align 4
/* Loop variant of L(shr_9), entered when RCX was at least 80.  */
834L(shr_9_gobble):
835 sub $32, %rcx
836 movdqa 16(%rsi), %xmm0
837 palignr $9, (%rsi), %xmm0
838 pcmpeqb (%rdi), %xmm0
839
840 movdqa 32(%rsi), %xmm3
841 palignr $9, 16(%rsi), %xmm3
842 pcmpeqb 16(%rdi), %xmm3
843
844L(shr_9_gobble_loop):
845 pand %xmm0, %xmm3
846 sub $32, %rcx /* CF is set if RCX was below 32 */
847 pmovmskb %xmm3, %edx
848 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
849
850 movdqa 64(%rsi), %xmm3
851 palignr $9, 48(%rsi), %xmm3
852 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
853 movdqa 48(%rsi), %xmm0
854 palignr $9, 32(%rsi), %xmm0
855 pcmpeqb 32(%rdi), %xmm0
856 lea 32(%rsi), %rsi
857 pcmpeqb 48(%rdi), %xmm3
858
859 lea 32(%rdi), %rdi
860 jz L(shr_9_gobble_loop) /* flags are still those of the sbb */
861 pand %xmm0, %xmm3
862
863 cmp $0, %rcx
864 jge L(shr_9_gobble_next)
865 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
866 add $32, %rcx
867L(shr_9_gobble_next):
868 test %edx, %edx
869 jnz L(exit)
870
871 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
872 movdqa %xmm0, %xmm1
873 lea 32(%rdi), %rdi
874 lea 32(%rsi), %rsi
875 sub $0xffff, %edx
876 jnz L(exit)
877
878 lea 9(%rsi), %rsi /* undo the rounding of RSI */
879 add %rcx, %rsi
880 add %rcx, %rdi
881 jmp L(less48bytes)
882
883 .p2align 4
/* Source offset 10: RSI was rounded down to a 16-byte boundary and EDX
   holds the 10 bytes that were dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $10 before it is compared
   with the aligned data at RDI.  */
884L(shr_10):
885 cmp $80, %rcx
886 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
887 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
888 jae L(shr_10_gobble)
889
890 movdqa 16(%rsi), %xmm1
891 movdqa %xmm1, %xmm2
892 palignr $10, (%rsi), %xmm1
893 pcmpeqb (%rdi), %xmm1
894
895 movdqa 32(%rsi), %xmm3
896 palignr $10, %xmm2, %xmm3
897 pcmpeqb 16(%rdi), %xmm3
898
899 pand %xmm1, %xmm3
900 pmovmskb %xmm3, %edx
901 lea 32(%rdi), %rdi
902 lea 32(%rsi), %rsi
903 sub $0xffff, %edx /* zero iff all 32 bytes matched */
904 jnz L(exit)
905 add $10, %rsi /* undo the rounding of RSI */
906 add %rcx, %rsi
907 add %rcx, %rdi
908 jmp L(less48bytes)
909
910 .p2align 4
/* Loop variant of L(shr_10), entered when RCX was at least 80.  */
911L(shr_10_gobble):
912 sub $32, %rcx
913 movdqa 16(%rsi), %xmm0
914 palignr $10, (%rsi), %xmm0
915 pcmpeqb (%rdi), %xmm0
916
917 movdqa 32(%rsi), %xmm3
918 palignr $10, 16(%rsi), %xmm3
919 pcmpeqb 16(%rdi), %xmm3
920
921L(shr_10_gobble_loop):
922 pand %xmm0, %xmm3
923 sub $32, %rcx /* CF is set if RCX was below 32 */
924 pmovmskb %xmm3, %edx
925 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
926
927 movdqa 64(%rsi), %xmm3
928 palignr $10, 48(%rsi), %xmm3
929 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
930 movdqa 48(%rsi), %xmm0
931 palignr $10, 32(%rsi), %xmm0
932 pcmpeqb 32(%rdi), %xmm0
933 lea 32(%rsi), %rsi
934 pcmpeqb 48(%rdi), %xmm3
935
936 lea 32(%rdi), %rdi
937 jz L(shr_10_gobble_loop) /* flags are still those of the sbb */
938 pand %xmm0, %xmm3
939
940 cmp $0, %rcx
941 jge L(shr_10_gobble_next)
942 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
943 add $32, %rcx
944L(shr_10_gobble_next):
945 test %edx, %edx
946 jnz L(exit)
947
948 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
949 movdqa %xmm0, %xmm1
950 lea 32(%rdi), %rdi
951 lea 32(%rsi), %rsi
952 sub $0xffff, %edx
953 jnz L(exit)
954
955 lea 10(%rsi), %rsi /* undo the rounding of RSI */
956 add %rcx, %rsi
957 add %rcx, %rdi
958 jmp L(less48bytes)
959
960 .p2align 4
/* Source offset 11: RSI was rounded down to a 16-byte boundary and EDX
   holds the 11 bytes that were dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $11 before it is compared
   with the aligned data at RDI.  */
961L(shr_11):
962 cmp $80, %rcx
963 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
964 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
965 jae L(shr_11_gobble)
966
967 movdqa 16(%rsi), %xmm1
968 movdqa %xmm1, %xmm2
969 palignr $11, (%rsi), %xmm1
970 pcmpeqb (%rdi), %xmm1
971
972 movdqa 32(%rsi), %xmm3
973 palignr $11, %xmm2, %xmm3
974 pcmpeqb 16(%rdi), %xmm3
975
976 pand %xmm1, %xmm3
977 pmovmskb %xmm3, %edx
978 lea 32(%rdi), %rdi
979 lea 32(%rsi), %rsi
980 sub $0xffff, %edx /* zero iff all 32 bytes matched */
981 jnz L(exit)
982 add $11, %rsi /* undo the rounding of RSI */
983 add %rcx, %rsi
984 add %rcx, %rdi
985 jmp L(less48bytes)
986
987 .p2align 4
/* Loop variant of L(shr_11), entered when RCX was at least 80.  */
988L(shr_11_gobble):
989 sub $32, %rcx
990 movdqa 16(%rsi), %xmm0
991 palignr $11, (%rsi), %xmm0
992 pcmpeqb (%rdi), %xmm0
993
994 movdqa 32(%rsi), %xmm3
995 palignr $11, 16(%rsi), %xmm3
996 pcmpeqb 16(%rdi), %xmm3
997
998L(shr_11_gobble_loop):
999 pand %xmm0, %xmm3
1000 sub $32, %rcx /* CF is set if RCX was below 32 */
1001 pmovmskb %xmm3, %edx
1002 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
1003
1004 movdqa 64(%rsi), %xmm3
1005 palignr $11, 48(%rsi), %xmm3
1006 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
1007 movdqa 48(%rsi), %xmm0
1008 palignr $11, 32(%rsi), %xmm0
1009 pcmpeqb 32(%rdi), %xmm0
1010 lea 32(%rsi), %rsi
1011 pcmpeqb 48(%rdi), %xmm3
1012
1013 lea 32(%rdi), %rdi
1014 jz L(shr_11_gobble_loop) /* flags are still those of the sbb */
1015 pand %xmm0, %xmm3
1016
1017 cmp $0, %rcx
1018 jge L(shr_11_gobble_next)
1019 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
1020 add $32, %rcx
1021L(shr_11_gobble_next):
1022 test %edx, %edx
1023 jnz L(exit)
1024
1025 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
1026 movdqa %xmm0, %xmm1
1027 lea 32(%rdi), %rdi
1028 lea 32(%rsi), %rsi
1029 sub $0xffff, %edx
1030 jnz L(exit)
1031
1032 lea 11(%rsi), %rsi /* undo the rounding of RSI */
1033 add %rcx, %rsi
1034 add %rcx, %rdi
1035 jmp L(less48bytes)
1036
1037# endif
1038
1039 .p2align 4
/* Source offset 12 (built for both memcmp and wmemcmp): RSI was rounded
   down to a 16-byte boundary and EDX holds the dropped offset.  That is
   12 in the memcmp build; the wmemcmp dispatcher jumps here without
   testing for 12 (every non-zero offset other than 4 and 8 ends up
   here).  Every 16-byte source vector is rebuilt from two aligned loads
   with palignr $12 before it is compared with the aligned data at RDI.  */
1040L(shr_12):
1041 cmp $80, %rcx
1042 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
1043 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
1044 jae L(shr_12_gobble)
1045
1046 movdqa 16(%rsi), %xmm1
1047 movdqa %xmm1, %xmm2
1048 palignr $12, (%rsi), %xmm1
1049 pcmpeqb (%rdi), %xmm1
1050
1051 movdqa 32(%rsi), %xmm3
1052 palignr $12, %xmm2, %xmm3
1053 pcmpeqb 16(%rdi), %xmm3
1054
1055 pand %xmm1, %xmm3
1056 pmovmskb %xmm3, %edx
1057 lea 32(%rdi), %rdi
1058 lea 32(%rsi), %rsi
1059 sub $0xffff, %edx /* zero iff all 32 bytes matched */
1060 jnz L(exit)
1061 add $12, %rsi /* undo the rounding of RSI */
1062 add %rcx, %rsi
1063 add %rcx, %rdi
1064 jmp L(less48bytes)
1065
1066 .p2align 4
/* Loop variant of L(shr_12), entered when RCX was at least 80.  */
1067L(shr_12_gobble):
1068 sub $32, %rcx
1069 movdqa 16(%rsi), %xmm0
1070 palignr $12, (%rsi), %xmm0
1071 pcmpeqb (%rdi), %xmm0
1072
1073 movdqa 32(%rsi), %xmm3
1074 palignr $12, 16(%rsi), %xmm3
1075 pcmpeqb 16(%rdi), %xmm3
1076
1077L(shr_12_gobble_loop):
1078 pand %xmm0, %xmm3
1079 sub $32, %rcx /* CF is set if RCX was below 32 */
1080 pmovmskb %xmm3, %edx
1081 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
1082
1083 movdqa 64(%rsi), %xmm3
1084 palignr $12, 48(%rsi), %xmm3
1085 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
1086 movdqa 48(%rsi), %xmm0
1087 palignr $12, 32(%rsi), %xmm0
1088 pcmpeqb 32(%rdi), %xmm0
1089 lea 32(%rsi), %rsi
1090 pcmpeqb 48(%rdi), %xmm3
1091
1092 lea 32(%rdi), %rdi
1093 jz L(shr_12_gobble_loop) /* flags are still those of the sbb */
1094 pand %xmm0, %xmm3
1095
1096 cmp $0, %rcx
1097 jge L(shr_12_gobble_next)
1098 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
1099 add $32, %rcx
1100L(shr_12_gobble_next):
1101 test %edx, %edx
1102 jnz L(exit)
1103
1104 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
1105 movdqa %xmm0, %xmm1
1106 lea 32(%rdi), %rdi
1107 lea 32(%rsi), %rsi
1108 sub $0xffff, %edx
1109 jnz L(exit)
1110
1111 lea 12(%rsi), %rsi /* undo the rounding of RSI */
1112 add %rcx, %rsi
1113 add %rcx, %rdi
1114 jmp L(less48bytes)
1115
1116# ifndef USE_AS_WMEMCMP
1117
1118 .p2align 4
/* Source offset 13: RSI was rounded down to a 16-byte boundary and EDX
   holds the 13 bytes that were dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $13 before it is compared
   with the aligned data at RDI.  */
1119L(shr_13):
1120 cmp $80, %rcx
1121 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
1122 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
1123 jae L(shr_13_gobble)
1124
1125 movdqa 16(%rsi), %xmm1
1126 movdqa %xmm1, %xmm2
1127 palignr $13, (%rsi), %xmm1
1128 pcmpeqb (%rdi), %xmm1
1129
1130 movdqa 32(%rsi), %xmm3
1131 palignr $13, %xmm2, %xmm3
1132 pcmpeqb 16(%rdi), %xmm3
1133
1134 pand %xmm1, %xmm3
1135 pmovmskb %xmm3, %edx
1136 lea 32(%rdi), %rdi
1137 lea 32(%rsi), %rsi
1138 sub $0xffff, %edx /* zero iff all 32 bytes matched */
1139 jnz L(exit)
1140 add $13, %rsi /* undo the rounding of RSI */
1141 add %rcx, %rsi
1142 add %rcx, %rdi
1143 jmp L(less48bytes)
1144
1145 .p2align 4
/* Loop variant of L(shr_13), entered when RCX was at least 80.  */
1146L(shr_13_gobble):
1147 sub $32, %rcx
1148 movdqa 16(%rsi), %xmm0
1149 palignr $13, (%rsi), %xmm0
1150 pcmpeqb (%rdi), %xmm0
1151
1152 movdqa 32(%rsi), %xmm3
1153 palignr $13, 16(%rsi), %xmm3
1154 pcmpeqb 16(%rdi), %xmm3
1155
1156L(shr_13_gobble_loop):
1157 pand %xmm0, %xmm3
1158 sub $32, %rcx /* CF is set if RCX was below 32 */
1159 pmovmskb %xmm3, %edx
1160 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
1161
1162 movdqa 64(%rsi), %xmm3
1163 palignr $13, 48(%rsi), %xmm3
1164 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
1165 movdqa 48(%rsi), %xmm0
1166 palignr $13, 32(%rsi), %xmm0
1167 pcmpeqb 32(%rdi), %xmm0
1168 lea 32(%rsi), %rsi
1169 pcmpeqb 48(%rdi), %xmm3
1170
1171 lea 32(%rdi), %rdi
1172 jz L(shr_13_gobble_loop) /* flags are still those of the sbb */
1173 pand %xmm0, %xmm3
1174
1175 cmp $0, %rcx
1176 jge L(shr_13_gobble_next)
1177 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
1178 add $32, %rcx
1179L(shr_13_gobble_next):
1180 test %edx, %edx
1181 jnz L(exit)
1182
1183 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
1184 movdqa %xmm0, %xmm1
1185 lea 32(%rdi), %rdi
1186 lea 32(%rsi), %rsi
1187 sub $0xffff, %edx
1188 jnz L(exit)
1189
1190 lea 13(%rsi), %rsi /* undo the rounding of RSI */
1191 add %rcx, %rsi
1192 add %rcx, %rdi
1193 jmp L(less48bytes)
1194
1195 .p2align 4
/* Source offset 14: RSI was rounded down to a 16-byte boundary and EDX
   holds the 14 bytes that were dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $14 before it is compared
   with the aligned data at RDI.  */
1196L(shr_14):
1197 cmp $80, %rcx
1198 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
1199 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
1200 jae L(shr_14_gobble)
1201
1202 movdqa 16(%rsi), %xmm1
1203 movdqa %xmm1, %xmm2
1204 palignr $14, (%rsi), %xmm1
1205 pcmpeqb (%rdi), %xmm1
1206
1207 movdqa 32(%rsi), %xmm3
1208 palignr $14, %xmm2, %xmm3
1209 pcmpeqb 16(%rdi), %xmm3
1210
1211 pand %xmm1, %xmm3
1212 pmovmskb %xmm3, %edx
1213 lea 32(%rdi), %rdi
1214 lea 32(%rsi), %rsi
1215 sub $0xffff, %edx /* zero iff all 32 bytes matched */
1216 jnz L(exit)
1217 add $14, %rsi /* undo the rounding of RSI */
1218 add %rcx, %rsi
1219 add %rcx, %rdi
1220 jmp L(less48bytes)
1221
1222 .p2align 4
/* Loop variant of L(shr_14), entered when RCX was at least 80.  */
1223L(shr_14_gobble):
1224 sub $32, %rcx
1225 movdqa 16(%rsi), %xmm0
1226 palignr $14, (%rsi), %xmm0
1227 pcmpeqb (%rdi), %xmm0
1228
1229 movdqa 32(%rsi), %xmm3
1230 palignr $14, 16(%rsi), %xmm3
1231 pcmpeqb 16(%rdi), %xmm3
1232
1233L(shr_14_gobble_loop):
1234 pand %xmm0, %xmm3
1235 sub $32, %rcx /* CF is set if RCX was below 32 */
1236 pmovmskb %xmm3, %edx
1237 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
1238
1239 movdqa 64(%rsi), %xmm3
1240 palignr $14, 48(%rsi), %xmm3
1241 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
1242 movdqa 48(%rsi), %xmm0
1243 palignr $14, 32(%rsi), %xmm0
1244 pcmpeqb 32(%rdi), %xmm0
1245 lea 32(%rsi), %rsi
1246 pcmpeqb 48(%rdi), %xmm3
1247
1248 lea 32(%rdi), %rdi
1249 jz L(shr_14_gobble_loop) /* flags are still those of the sbb */
1250 pand %xmm0, %xmm3
1251
1252 cmp $0, %rcx
1253 jge L(shr_14_gobble_next)
1254 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
1255 add $32, %rcx
1256L(shr_14_gobble_next):
1257 test %edx, %edx
1258 jnz L(exit)
1259
1260 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
1261 movdqa %xmm0, %xmm1
1262 lea 32(%rdi), %rdi
1263 lea 32(%rsi), %rsi
1264 sub $0xffff, %edx
1265 jnz L(exit)
1266
1267 lea 14(%rsi), %rsi /* undo the rounding of RSI */
1268 add %rcx, %rsi
1269 add %rcx, %rdi
1270 jmp L(less48bytes)
1271
1272 .p2align 4
/* Source offset 15: RSI was rounded down to a 16-byte boundary and EDX
   holds the 15 bytes that were dropped.  Every 16-byte source vector is
   rebuilt from two aligned loads with palignr $15 before it is compared
   with the aligned data at RDI.  */
1273L(shr_15):
1274 cmp $80, %rcx
1275 lea -48(%rcx), %rcx /* lea leaves the flags of the cmp untouched */
1276 mov %edx, %eax /* keep the offset: L(exit) adds RAX to RSI */
1277 jae L(shr_15_gobble)
1278
1279 movdqa 16(%rsi), %xmm1
1280 movdqa %xmm1, %xmm2
1281 palignr $15, (%rsi), %xmm1
1282 pcmpeqb (%rdi), %xmm1
1283
1284 movdqa 32(%rsi), %xmm3
1285 palignr $15, %xmm2, %xmm3
1286 pcmpeqb 16(%rdi), %xmm3
1287
1288 pand %xmm1, %xmm3
1289 pmovmskb %xmm3, %edx
1290 lea 32(%rdi), %rdi
1291 lea 32(%rsi), %rsi
1292 sub $0xffff, %edx /* zero iff all 32 bytes matched */
1293 jnz L(exit)
1294 add $15, %rsi /* undo the rounding of RSI */
1295 add %rcx, %rsi
1296 add %rcx, %rdi
1297 jmp L(less48bytes)
1298
1299 .p2align 4
/* Loop variant of L(shr_15), entered when RCX was at least 80.  */
1300L(shr_15_gobble):
1301 sub $32, %rcx
1302 movdqa 16(%rsi), %xmm0
1303 palignr $15, (%rsi), %xmm0
1304 pcmpeqb (%rdi), %xmm0
1305
1306 movdqa 32(%rsi), %xmm3
1307 palignr $15, 16(%rsi), %xmm3
1308 pcmpeqb 16(%rdi), %xmm3
1309
1310L(shr_15_gobble_loop):
1311 pand %xmm0, %xmm3
1312 sub $32, %rcx /* CF is set if RCX was below 32 */
1313 pmovmskb %xmm3, %edx
1314 movdqa %xmm0, %xmm1 /* first-half result of the pair being checked */
1315
1316 movdqa 64(%rsi), %xmm3
1317 palignr $15, 48(%rsi), %xmm3
1318 sbb $0xffff, %edx /* zero iff all 32 bytes matched and the sub did not borrow */
1319 movdqa 48(%rsi), %xmm0
1320 palignr $15, 32(%rsi), %xmm0
1321 pcmpeqb 32(%rdi), %xmm0
1322 lea 32(%rsi), %rsi
1323 pcmpeqb 48(%rdi), %xmm3
1324
1325 lea 32(%rdi), %rdi
1326 jz L(shr_15_gobble_loop) /* flags are still those of the sbb */
1327 pand %xmm0, %xmm3
1328
1329 cmp $0, %rcx
1330 jge L(shr_15_gobble_next)
1331 inc %edx /* RCX went negative: cancel the borrow sbb subtracted */
1332 add $32, %rcx
1333L(shr_15_gobble_next):
1334 test %edx, %edx
1335 jnz L(exit)
1336
1337 pmovmskb %xmm3, %edx /* check the pair that was compared ahead */
1338 movdqa %xmm0, %xmm1
1339 lea 32(%rdi), %rdi
1340 lea 32(%rsi), %rsi
1341 sub $0xffff, %edx
1342 jnz L(exit)
1343
1344 lea 15(%rsi), %rsi /* undo the rounding of RSI */
1345 add %rcx, %rsi
1346 add %rcx, %rdi
1347 jmp L(less48bytes)
1348# endif
1349 .p2align 4
/* A 32-byte pair differs.  On entry:
     RDI/RSI  already advanced past that pair (RSI still rounded down),
     XMM1     pcmpeqb result for the first 16 bytes of the pair,
     EDX      combined equality mask of the pair minus 0xffff,
     RAX      source offset that was rounded away from RSI.
   Selects the 16-byte half holding the first difference, then falls
   into L(less16bytes) to locate the differing element.  */
1350L(exit):
1351 pmovmskb %xmm1, %r8d
1352 sub $0xffff, %r8d
1353 jz L(first16bytes) /* first half is equal: the difference is in the second */
1354 lea -16(%rsi), %rsi /* otherwise step both pointers back to the first half */
1355 lea -16(%rdi), %rdi
1356 mov %r8d, %edx /* and use its mask */
1357L(first16bytes):
1358 add %rax, %rsi /* restore the real (unrounded) source position */
1359L(less16bytes):
1360# ifndef USE_AS_WMEMCMP
 /* RDI/RSI point 16 bytes past the start of the differing vector.
    EDX is the equality mask minus 0xffff, i.e. the negated mismatch
    mask, so its lowest set bit marks the first differing byte.  The
    result is the difference of the two bytes after zero extension
    (unsigned comparison).  */
1361 test %dl, %dl
1362 jz L(next_24_bytes) /* bytes 0..7 are equal */
1363
1364 test $0x01, %dl
1365 jnz L(Byte16)
1366
1367 test $0x02, %dl
1368 jnz L(Byte17)
1369
1370 test $0x04, %dl
1371 jnz L(Byte18)
1372
1373 test $0x08, %dl
1374 jnz L(Byte19)
1375
1376 test $0x10, %dl
1377 jnz L(Byte20)
1378
1379 test $0x20, %dl
1380 jnz L(Byte21)
1381
1382 test $0x40, %dl
1383 jnz L(Byte22)
1384
 /* Only bit 7 is left: byte 7 of the vector differs.  */
1385 movzbl -9(%rdi), %eax
1386 movzbl -9(%rsi), %edx
1387 sub %edx, %eax
1388 ret
1389
1390 .p2align 4
 /* L(ByteNN): return the difference of the bytes NN-32 bytes relative
    to the current pointers (-16 for Byte16 ... -10 for Byte22).  */
1391L(Byte16):
1392 movzbl -16(%rdi), %eax
1393 movzbl -16(%rsi), %edx
1394 sub %edx, %eax
1395 ret
1396
1397 .p2align 4
1398L(Byte17):
1399 movzbl -15(%rdi), %eax
1400 movzbl -15(%rsi), %edx
1401 sub %edx, %eax
1402 ret
1403
1404 .p2align 4
1405L(Byte18):
1406 movzbl -14(%rdi), %eax
1407 movzbl -14(%rsi), %edx
1408 sub %edx, %eax
1409 ret
1410
1411 .p2align 4
1412L(Byte19):
1413 movzbl -13(%rdi), %eax
1414 movzbl -13(%rsi), %edx
1415 sub %edx, %eax
1416 ret
1417
1418 .p2align 4
1419L(Byte20):
1420 movzbl -12(%rdi), %eax
1421 movzbl -12(%rsi), %edx
1422 sub %edx, %eax
1423 ret
1424
1425 .p2align 4
1426L(Byte21):
1427 movzbl -11(%rdi), %eax
1428 movzbl -11(%rsi), %edx
1429 sub %edx, %eax
1430 ret
1431
1432 .p2align 4
1433L(Byte22):
1434 movzbl -10(%rdi), %eax
1435 movzbl -10(%rsi), %edx
1436 sub %edx, %eax
1437 ret
1438
1439 .p2align 4
 /* The first difference is in bytes 8..15.  Advance both pointers by 8
    so the L(ByteNN) exits above can be reused with the high mask byte.  */
1440L(next_24_bytes):
1441 lea 8(%rdi), %rdi
1442 lea 8(%rsi), %rsi
1443 test $0x01, %dh
1444 jnz L(Byte16)
1445
1446 test $0x02, %dh
1447 jnz L(Byte17)
1448
1449 test $0x04, %dh
1450 jnz L(Byte18)
1451
1452 test $0x08, %dh
1453 jnz L(Byte19)
1454
1455 test $0x10, %dh
1456 jnz L(Byte20)
1457
1458 test $0x20, %dh
1459 jnz L(Byte21)
1460
1461 test $0x40, %dh
1462 jnz L(Byte22)
1463
1464 movzbl -9(%rdi), %eax
1465 movzbl -9(%rsi), %edx
1466 sub %edx, %eax
1467 ret
1468# else
1469/* special for wmemcmp: pick the first of the four 32-bit elements whose
   mask nibble is non-zero and compare that element.  L(find_diff) is
   defined outside this excerpt.  */
1470 xor %eax, %eax
1471 test %dl, %dl
1472 jz L(next_two_double_words) /* elements 0 and 1 are equal */
1473 and $15, %dl
1474 jz L(second_double_word) /* element 0 is equal */
1475 mov -16(%rdi), %eax
1476 cmp -16(%rsi), %eax
1477 jne L(find_diff)
1478 ret
1479
1480 .p2align 4
1481L(second_double_word):
1482 mov -12(%rdi), %eax
1483 cmp -12(%rsi), %eax
1484 jne L(find_diff)
1485 ret
1486
1487 .p2align 4
1488L(next_two_double_words):
1489 and $15, %dh
1490 jz L(fourth_double_word) /* element 2 is equal */
1491 mov -8(%rdi), %eax
1492 cmp -8(%rsi), %eax
1493 jne L(find_diff)
1494 ret
1495
1496 .p2align 4
1497L(fourth_double_word):
1498 mov -4(%rdi), %eax
1499 cmp -4(%rsi), %eax
1500 jne L(find_diff)
1501 ret
1502# endif
1503
1504 .p2align 4
1505L(less48bytes):
1506 cmp $8, %ecx
1507 jae L(more8bytes)
1508 cmp $0, %ecx
1509 je L(0bytes)
1510# ifndef USE_AS_WMEMCMP
1511 cmp $1, %ecx
1512 je L(1bytes)
1513 cmp $2, %ecx
1514 je L(2bytes)
1515 cmp $3, %ecx
1516 je L(3bytes)
1517 cmp $4, %ecx
1518 je L(4bytes)
1519 cmp $5, %ecx
1520 je L(5bytes)
1521 cmp $6, %ecx
1522 je L(6bytes)
1523 jmp L(7bytes)
1524# else
1525 jmp L(4bytes)
1526# endif
1527
1528 .p2align 4
1529L(more8bytes):
1530 cmp $16, %ecx
1531 jae L(more16bytes)
1532 cmp $8, %ecx
1533 je L(8bytes)
1534# ifndef USE_AS_WMEMCMP
1535 cmp $9, %ecx
1536 je L(9bytes)
1537 cmp $10, %ecx
1538 je L(10bytes)
1539 cmp $11, %ecx
1540 je L(11bytes)
1541 cmp $12, %ecx
1542 je L(12bytes)
1543 cmp $13, %ecx
1544 je L(13bytes)
1545 cmp $14, %ecx
1546 je L(14bytes)
1547 jmp L(15bytes)
1548# else
1549 jmp L(12bytes)
1550# endif
1551
1552 .p2align 4
1553L(more16bytes):
1554 cmp $24, %ecx
1555 jae L(more24bytes)
1556 cmp $16, %ecx
1557 je L(16bytes)
1558# ifndef USE_AS_WMEMCMP
1559 cmp $17, %ecx
1560 je L(17bytes)
1561 cmp $18, %ecx
1562 je L(18bytes)
1563 cmp $19, %ecx
1564 je L(19bytes)
1565 cmp $20, %ecx
1566 je L(20bytes)
1567 cmp $21, %ecx
1568 je L(21bytes)
1569 cmp $22, %ecx
1570 je L(22bytes)
1571 jmp L(23bytes)
1572# else
1573 jmp L(20bytes)
1574# endif
1575
1576 .p2align 4
/* Fourth dispatch stage: %ecx >= 24 on arrival.  Lengths 24..31 are
   routed to their L(Nbytes) handler; 32 and above continue at
   L(more32bytes).  */
1577L(more24bytes):
1578 cmp $32, %ecx
1579 jae L(more32bytes)
1580 cmp $24, %ecx
1581 je L(24bytes)
1582# ifndef USE_AS_WMEMCMP
	/* memcmp: 25..30 are tested explicitly, 31 is what remains.  */
1583 cmp $25, %ecx
1584 je L(25bytes)
1585 cmp $26, %ecx
1586 je L(26bytes)
1587 cmp $27, %ecx
1588 je L(27bytes)
1589 cmp $28, %ecx
1590 je L(28bytes)
1591 cmp $29, %ecx
1592 je L(29bytes)
1593 cmp $30, %ecx
1594 je L(30bytes)
1595 jmp L(31bytes)
1596# else
	/* wmemcmp: lengths are multiples of 4, so the only value left in
	   25..31 is 28.  */
1597 jmp L(28bytes)
1598# endif
1599
1600 .p2align 4
/* Fifth dispatch stage: %ecx >= 32 on arrival.  Lengths 32..39 are routed
   to their L(Nbytes) handler; 40 and above continue at L(more40bytes).  */
1601L(more32bytes):
1602 cmp $40, %ecx
1603 jae L(more40bytes)
1604 cmp $32, %ecx
1605 je L(32bytes)
1606# ifndef USE_AS_WMEMCMP
	/* memcmp: 33..38 are tested explicitly, 39 is what remains.  */
1607 cmp $33, %ecx
1608 je L(33bytes)
1609 cmp $34, %ecx
1610 je L(34bytes)
1611 cmp $35, %ecx
1612 je L(35bytes)
1613 cmp $36, %ecx
1614 je L(36bytes)
1615 cmp $37, %ecx
1616 je L(37bytes)
1617 cmp $38, %ecx
1618 je L(38bytes)
1619 jmp L(39bytes)
1620# else
	/* wmemcmp: lengths are multiples of 4, so the only value left in
	   33..39 is 36.  */
1621 jmp L(36bytes)
1622# endif
1623
1624 .p2align 4
/* Last dispatch stage: %ecx >= 40 on arrival.  No upper bound is tested
   here; every value that is not 40..46 ends up at L(47bytes) in the
   memcmp build.  In the wmemcmp build nothing follows the test for 40
   inside this stage, so execution falls through to the L(44bytes) label
   of the wmemcmp compare ladder that comes next in the file.  */
1625L(more40bytes):
1626 cmp $40, %ecx
1627 je L(40bytes)
1628# ifndef USE_AS_WMEMCMP
	/* memcmp: 41..46 are tested explicitly, 47 is what remains.  */
1629 cmp $41, %ecx
1630 je L(41bytes)
1631 cmp $42, %ecx
1632 je L(42bytes)
1633 cmp $43, %ecx
1634 je L(43bytes)
1635 cmp $44, %ecx
1636 je L(44bytes)
1637 cmp $45, %ecx
1638 je L(45bytes)
1639 cmp $46, %ecx
1640 je L(46bytes)
1641 jmp L(47bytes)
1642
1643 .p2align 4
/* memcmp compare ladder for lengths that are a multiple of 4 (44 down to
   0).  Entering at L(Nbytes) compares the N bytes that end at %rdi/%rsi,
   one 32-bit word at a time from the lowest address upwards, falling
   through from label to label.
   Each step loads the %rdi-side word into %eax and the %rsi-side word
   into %ecx before comparing: L(find_diff) in this build inspects both
   registers to locate the first differing byte.  %ecx no longer holds
   the length once a step has run.
   Falling out of the bottom at L(0bytes) means everything matched and
   the result is 0.  */
1644L(44bytes):
1645 movl -44(%rdi), %eax
1646 movl -44(%rsi), %ecx
1647 cmp %ecx, %eax
1648 jne L(find_diff)
1649L(40bytes):
1650 movl -40(%rdi), %eax
1651 movl -40(%rsi), %ecx
1652 cmp %ecx, %eax
1653 jne L(find_diff)
1654L(36bytes):
1655 movl -36(%rdi), %eax
1656 movl -36(%rsi), %ecx
1657 cmp %ecx, %eax
1658 jne L(find_diff)
1659L(32bytes):
1660 movl -32(%rdi), %eax
1661 movl -32(%rsi), %ecx
1662 cmp %ecx, %eax
1663 jne L(find_diff)
1664L(28bytes):
1665 movl -28(%rdi), %eax
1666 movl -28(%rsi), %ecx
1667 cmp %ecx, %eax
1668 jne L(find_diff)
1669L(24bytes):
1670 movl -24(%rdi), %eax
1671 movl -24(%rsi), %ecx
1672 cmp %ecx, %eax
1673 jne L(find_diff)
1674L(20bytes):
1675 movl -20(%rdi), %eax
1676 movl -20(%rsi), %ecx
1677 cmp %ecx, %eax
1678 jne L(find_diff)
1679L(16bytes):
1680 movl -16(%rdi), %eax
1681 movl -16(%rsi), %ecx
1682 cmp %ecx, %eax
1683 jne L(find_diff)
1684L(12bytes):
1685 movl -12(%rdi), %eax
1686 movl -12(%rsi), %ecx
1687 cmp %ecx, %eax
1688 jne L(find_diff)
1689L(8bytes):
1690 movl -8(%rdi), %eax
1691 movl -8(%rsi), %ecx
1692 cmp %ecx, %eax
1693 jne L(find_diff)
1694L(4bytes):
1695 movl -4(%rdi), %eax
1696 movl -4(%rsi), %ecx
1697 cmp %ecx, %eax
1698 jne L(find_diff)
	/* All words equal (or length was 0): return 0.  */
1699L(0bytes):
1700 xor %eax, %eax
1701 ret
1702# else
1703 .p2align 4
/* wmemcmp compare ladder (44 bytes down to 0, i.e. 11 elements down to
   none).  Entering at L(Nbytes) compares the N bytes that end at
   %rdi/%rsi, one 32-bit element at a time from the lowest address
   upwards, falling through from label to label.
   The %rsi-side element is compared straight from memory: L(find_diff)
   in this build only looks at the flags left by the cmp, so no second
   register is needed.
   Falling out of the bottom at L(0bytes) means everything matched and
   the result is 0.  */
1704L(44bytes):
1705 movl -44(%rdi), %eax
1706 cmp -44(%rsi), %eax
1707 jne L(find_diff)
1708L(40bytes):
1709 movl -40(%rdi), %eax
1710 cmp -40(%rsi), %eax
1711 jne L(find_diff)
1712L(36bytes):
1713 movl -36(%rdi), %eax
1714 cmp -36(%rsi), %eax
1715 jne L(find_diff)
1716L(32bytes):
1717 movl -32(%rdi), %eax
1718 cmp -32(%rsi), %eax
1719 jne L(find_diff)
1720L(28bytes):
1721 movl -28(%rdi), %eax
1722 cmp -28(%rsi), %eax
1723 jne L(find_diff)
1724L(24bytes):
1725 movl -24(%rdi), %eax
1726 cmp -24(%rsi), %eax
1727 jne L(find_diff)
1728L(20bytes):
1729 movl -20(%rdi), %eax
1730 cmp -20(%rsi), %eax
1731 jne L(find_diff)
1732L(16bytes):
1733 movl -16(%rdi), %eax
1734 cmp -16(%rsi), %eax
1735 jne L(find_diff)
1736L(12bytes):
1737 movl -12(%rdi), %eax
1738 cmp -12(%rsi), %eax
1739 jne L(find_diff)
1740L(8bytes):
1741 movl -8(%rdi), %eax
1742 cmp -8(%rsi), %eax
1743 jne L(find_diff)
1744L(4bytes):
1745 movl -4(%rdi), %eax
1746 cmp -4(%rsi), %eax
1747 jne L(find_diff)
	/* All elements equal (or length was 0): return 0.  */
1748L(0bytes):
1749 xor %eax, %eax
1750 ret
1751# endif
1752
1753# ifndef USE_AS_WMEMCMP
1754 .p2align 4
/* memcmp compare ladder for lengths of the form 4k+1 (45 down to 1).
   Entering at L(Nbytes) compares the N bytes that end at %rdi/%rsi:
   32-bit words from offset -N up to offset -5, then the single byte at
   offset -1.  A differing word goes to L(find_diff) with the %rdi-side
   word in %eax and the %rsi-side word in %ecx.  */
1755L(45bytes):
1756 movl -45(%rdi), %eax
1757 movl -45(%rsi), %ecx
1758 cmp %ecx, %eax
1759 jne L(find_diff)
1760L(41bytes):
1761 movl -41(%rdi), %eax
1762 movl -41(%rsi), %ecx
1763 cmp %ecx, %eax
1764 jne L(find_diff)
1765L(37bytes):
1766 movl -37(%rdi), %eax
1767 movl -37(%rsi), %ecx
1768 cmp %ecx, %eax
1769 jne L(find_diff)
1770L(33bytes):
1771 movl -33(%rdi), %eax
1772 movl -33(%rsi), %ecx
1773 cmp %ecx, %eax
1774 jne L(find_diff)
1775L(29bytes):
1776 movl -29(%rdi), %eax
1777 movl -29(%rsi), %ecx
1778 cmp %ecx, %eax
1779 jne L(find_diff)
1780L(25bytes):
1781 movl -25(%rdi), %eax
1782 movl -25(%rsi), %ecx
1783 cmp %ecx, %eax
1784 jne L(find_diff)
1785L(21bytes):
1786 movl -21(%rdi), %eax
1787 movl -21(%rsi), %ecx
1788 cmp %ecx, %eax
1789 jne L(find_diff)
1790L(17bytes):
1791 movl -17(%rdi), %eax
1792 movl -17(%rsi), %ecx
1793 cmp %ecx, %eax
1794 jne L(find_diff)
1795L(13bytes):
1796 movl -13(%rdi), %eax
1797 movl -13(%rsi), %ecx
1798 cmp %ecx, %eax
1799 jne L(find_diff)
1800L(9bytes):
1801 movl -9(%rdi), %eax
1802 movl -9(%rsi), %ecx
1803 cmp %ecx, %eax
1804 jne L(find_diff)
1805L(5bytes):
1806 movl -5(%rdi), %eax
1807 movl -5(%rsi), %ecx
1808 cmp %ecx, %eax
1809 jne L(find_diff)
	/* Final byte.  The unsigned byte compare leaves CF set when the
	   %rdi-side byte is the smaller one; L(set) turns CF into -1 / 1.  */
1810L(1bytes):
1811 movzbl -1(%rdi), %eax
1812 cmpb -1(%rsi), %al
1813 jne L(set)
1814 xor %eax, %eax
1815 ret
1816
1817 .p2align 4
/* memcmp compare ladder for lengths of the form 4k+2 (46 down to 2).
   Entering at L(Nbytes) compares the N bytes that end at %rdi/%rsi:
   32-bit words from offset -N up to offset -6, then the 16-bit value at
   offset -2.  A differing word goes to L(find_diff) with the %rdi-side
   word in %eax and the %rsi-side word in %ecx.  */
1818L(46bytes):
1819 movl -46(%rdi), %eax
1820 movl -46(%rsi), %ecx
1821 cmp %ecx, %eax
1822 jne L(find_diff)
1823L(42bytes):
1824 movl -42(%rdi), %eax
1825 movl -42(%rsi), %ecx
1826 cmp %ecx, %eax
1827 jne L(find_diff)
1828L(38bytes):
1829 movl -38(%rdi), %eax
1830 movl -38(%rsi), %ecx
1831 cmp %ecx, %eax
1832 jne L(find_diff)
1833L(34bytes):
1834 movl -34(%rdi), %eax
1835 movl -34(%rsi), %ecx
1836 cmp %ecx, %eax
1837 jne L(find_diff)
1838L(30bytes):
1839 movl -30(%rdi), %eax
1840 movl -30(%rsi), %ecx
1841 cmp %ecx, %eax
1842 jne L(find_diff)
1843L(26bytes):
1844 movl -26(%rdi), %eax
1845 movl -26(%rsi), %ecx
1846 cmp %ecx, %eax
1847 jne L(find_diff)
1848L(22bytes):
1849 movl -22(%rdi), %eax
1850 movl -22(%rsi), %ecx
1851 cmp %ecx, %eax
1852 jne L(find_diff)
1853L(18bytes):
1854 movl -18(%rdi), %eax
1855 movl -18(%rsi), %ecx
1856 cmp %ecx, %eax
1857 jne L(find_diff)
1858L(14bytes):
1859 movl -14(%rdi), %eax
1860 movl -14(%rsi), %ecx
1861 cmp %ecx, %eax
1862 jne L(find_diff)
1863L(10bytes):
1864 movl -10(%rdi), %eax
1865 movl -10(%rsi), %ecx
1866 cmp %ecx, %eax
1867 jne L(find_diff)
1868L(6bytes):
1869 movl -6(%rdi), %eax
1870 movl -6(%rsi), %ecx
1871 cmp %ecx, %eax
1872 jne L(find_diff)
	/* Final two bytes, loaded zero-extended.  The low register byte is
	   the one at the lower address, so it is tested first; if it matches,
	   the full compare is decided by the second byte.  L(set) turns the
	   resulting CF into -1 / 1.  */
1873L(2bytes):
1874 movzwl -2(%rdi), %eax
1875 movzwl -2(%rsi), %ecx
1876 cmpb %cl, %al
1877 jne L(set)
1878 cmp %ecx, %eax
1879 jne L(set)
1880 xor %eax, %eax
1881 ret
1882
1883 .p2align 4
/* memcmp compare ladder for lengths of the form 4k+3 (47 down to 3).
   Entering at L(Nbytes) compares the N bytes that end at %rdi/%rsi:
   32-bit words from offset -N up to offset -7, then the 16-bit value at
   offset -3 and the byte at offset -1.  A differing word goes to
   L(find_diff) with the %rdi-side word in %eax and the %rsi-side word in
   %ecx.  */
1884L(47bytes):
1885 movl -47(%rdi), %eax
1886 movl -47(%rsi), %ecx
1887 cmp %ecx, %eax
1888 jne L(find_diff)
1889L(43bytes):
1890 movl -43(%rdi), %eax
1891 movl -43(%rsi), %ecx
1892 cmp %ecx, %eax
1893 jne L(find_diff)
1894L(39bytes):
1895 movl -39(%rdi), %eax
1896 movl -39(%rsi), %ecx
1897 cmp %ecx, %eax
1898 jne L(find_diff)
1899L(35bytes):
1900 movl -35(%rdi), %eax
1901 movl -35(%rsi), %ecx
1902 cmp %ecx, %eax
1903 jne L(find_diff)
1904L(31bytes):
1905 movl -31(%rdi), %eax
1906 movl -31(%rsi), %ecx
1907 cmp %ecx, %eax
1908 jne L(find_diff)
1909L(27bytes):
1910 movl -27(%rdi), %eax
1911 movl -27(%rsi), %ecx
1912 cmp %ecx, %eax
1913 jne L(find_diff)
1914L(23bytes):
1915 movl -23(%rdi), %eax
1916 movl -23(%rsi), %ecx
1917 cmp %ecx, %eax
1918 jne L(find_diff)
1919L(19bytes):
1920 movl -19(%rdi), %eax
1921 movl -19(%rsi), %ecx
1922 cmp %ecx, %eax
1923 jne L(find_diff)
1924L(15bytes):
1925 movl -15(%rdi), %eax
1926 movl -15(%rsi), %ecx
1927 cmp %ecx, %eax
1928 jne L(find_diff)
1929L(11bytes):
1930 movl -11(%rdi), %eax
1931 movl -11(%rsi), %ecx
1932 cmp %ecx, %eax
1933 jne L(find_diff)
1934L(7bytes):
1935 movl -7(%rdi), %eax
1936 movl -7(%rsi), %ecx
1937 cmp %ecx, %eax
1938 jne L(find_diff)
	/* Final three bytes: a zero-extended 16-bit pair (lower-address byte
	   tested first, then the pair as a whole), followed by the last byte.
	   L(set) turns the CF left by the failing compare into -1 / 1.  */
1939L(3bytes):
1940 movzwl -3(%rdi), %eax
1941 movzwl -3(%rsi), %ecx
1942 cmpb %cl, %al
1943 jne L(set)
1944 cmp %ecx, %eax
1945 jne L(set)
1946 movzbl -1(%rdi), %eax
1947 cmpb -1(%rsi), %al
1948 jne L(set)
1949 xor %eax, %eax
1950 ret
1951
1952 .p2align 4
/* memcmp result for a 32-bit mismatch.
   In:  %eax = word from the %rdi side, %ecx = word from the %rsi side.
   Out: %eax = -1 if the first differing byte is smaller on the %rdi
        side (unsigned), 1 otherwise.
   The bytes are examined in memory order, i.e. from the least
   significant register byte upwards, and the first compare that finds a
   difference leaves its carry flag for L(set).
   L(set) is also entered directly by the short-tail handlers with the
   flags of their own byte/word compare.  */
1953L(find_diff):
	/* Byte 0.  */
1954 cmpb %cl, %al
1955 jne L(set)
	/* Byte 0 matched, so a 16-bit difference is decided by byte 1.  */
1956 cmpw %cx, %ax
1957 jne L(set)
	/* Bring bytes 2 and 3 down and test byte 2.  */
1958 shr $16, %eax
1959 shr $16, %ecx
1960 cmpb %cl, %al
1961 jne L(set)
1962
1963/* We get here only if we already know there is a
1964difference. */
1965
	/* Only byte 3 can differ now; this compare just sets CF for it.  */
1966 cmp %ecx, %eax
	/* CF = 1 (borrow, %rdi side below):  sbb gives -1 and keeps CF, then
	   -1 - (-1) - 1 = -1.
	   CF = 0 (%rdi side above):          sbb gives 0 and clears CF, then
	   0 - (-1) - 0 = 1.  */
1967L(set):
1968 sbb %eax, %eax
1969 sbb $-1, %eax
1970 ret
1971# else
1972
1973/* for wmemcmp */
1974 .p2align 4
/* wmemcmp result for a mismatching element.
   In:  flags from a `cmp <rsi-side element>, %eax' that found the
        %rdi-side element in %eax different.
   Out: %eax = 1 if the %rdi-side element is greater as a SIGNED 32-bit
        value, -1 otherwise.
   The mov below does not modify the flags, so jg still tests the
   caller's compare.  */
1975L(find_diff):
1976 mov $1, %eax
1977 jg L(find_diff_bigger)
	/* Not greater (and known to differ): return -1.  */
1978 neg %eax
1979 ret
1980
1981 .p2align 4
	/* Greater: %eax already holds 1.  */
1982L(find_diff_bigger):
1983 ret
1984# endif
1985
1986 .p2align 4
/* Returns 0.  The entry code jumps here in the wmemcmp build when the
   scaled length is zero.  */
1987L(equal):
1988 xor %eax, %eax
1989 ret
1990
1991END (MEMCMP)
1992#endif
1993

source code of glibc/sysdeps/x86_64/multiarch/memcmp-ssse3.S