1/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
2 Copyright (C) 2018-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
23# if defined USE_AS_STRCASECMP_L
24# include "locale-defines.h"
25# endif
26
27# ifndef STRCMP
28# define STRCMP __strcmp_avx2
29# endif
30
31# define PAGE_SIZE 4096
32
33 /* VEC_SIZE = Number of bytes in a ymm register. */
34# define VEC_SIZE 32
35
36# define VMOVU vmovdqu
37# define VMOVA vmovdqa
38
39# ifdef USE_AS_WCSCMP
40 /* Compare packed dwords. */
41# define VPCMPEQ vpcmpeqd
42 /* Compare packed dwords and store minimum. */
43# define VPMINU vpminud
44 /* 1 dword char == 4 bytes. */
45# define SIZE_OF_CHAR 4
46# else
47 /* Compare packed bytes. */
48# define VPCMPEQ vpcmpeqb
49 /* Compare packed bytes and store minimum. */
50# define VPMINU vpminub
51 /* 1 byte char == 1 byte. */
52# define SIZE_OF_CHAR 1
53# endif
54
55# ifdef USE_AS_STRNCMP
56# define LOOP_REG r9d
57# define LOOP_REG64 r9
58
59# define OFFSET_REG8 r9b
60# define OFFSET_REG r9d
61# define OFFSET_REG64 r9
62# else
63# define LOOP_REG edx
64# define LOOP_REG64 rdx
65
66# define OFFSET_REG8 dl
67# define OFFSET_REG edx
68# define OFFSET_REG64 rdx
69# endif
70
71# ifndef VZEROUPPER
72# define VZEROUPPER vzeroupper
73# endif
74
75# if defined USE_AS_STRNCMP
76# define VEC_OFFSET 0
77# else
78# define VEC_OFFSET (-VEC_SIZE)
79# endif
80
81# ifdef USE_AS_STRCASECMP_L
82# define BYTE_LOOP_REG OFFSET_REG
83# else
84# define BYTE_LOOP_REG ecx
85# endif
86
87# ifdef USE_AS_STRCASECMP_L
88# ifdef USE_AS_STRNCMP
89# define STRCASECMP __strncasecmp_avx2
90# define LOCALE_REG rcx
91# define LOCALE_REG_LP RCX_LP
92# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
93# else
94# define STRCASECMP __strcasecmp_avx2
95# define LOCALE_REG rdx
96# define LOCALE_REG_LP RDX_LP
97# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
98# endif
99# endif
100
101# define xmmZERO xmm15
102# define ymmZERO ymm15
103
104# define LCASE_MIN_ymm %ymm10
105# define LCASE_MAX_ymm %ymm11
106# define CASE_ADD_ymm %ymm12
107
108# define LCASE_MIN_xmm %xmm10
109# define LCASE_MAX_xmm %xmm11
110# define CASE_ADD_xmm %xmm12
111
	/* r11 is never used elsewhere, so this is safe to maintain. */
113# define TOLOWER_BASE %r11
114
115# ifndef SECTION
116# define SECTION(p) p##.avx
117# endif
118
119# ifdef USE_AS_STRCASECMP_L
120# define REG(x, y) x ## y
121# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
122 vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
123 vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
124 vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
125 vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
126 vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
127 vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
128 vpaddb REG(%ext, 8), reg1_in, reg1_out; \
129 vpaddb REG(%ext, 9), reg2_in, reg2_out
130
131# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
132# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
133# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
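
	/* A rough per-byte C model of what the TOLOWER macro above
	   computes, assuming the L(lcase_min)/L(lcase_max)/L(case_add)
	   constants defined further down (0x3f, 0x99 and 0x20 in every
	   byte); the helper name is illustrative only, not a glibc
	   function:

	   static unsigned char
	   tolower_ascii_branchless (unsigned char c)
	   {
	     signed char shifted = (signed char) (c + 0x3f); // 'A'..'Z' -> -128..-103
	     // vpcmpgtb: 0x00 for uppercase ASCII, 0xff for everything else.
	     unsigned char not_upper = shifted > (signed char) 0x99 ? 0xff : 0;
	     unsigned char to_add = (unsigned char) ~not_upper & 0x20;  // vpandn
	     return (unsigned char) (c + to_add);                       // vpaddb
	   }  */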
134
135# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
136 TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
137 VPCMPEQ scratch_reg, s2_reg, reg_out
138
139# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
140 VMOVU s2_mem, reg_out; \
141 CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
142
143# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
144# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
145
146# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
147# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
148
149# else
150# define TOLOWER_gpr(...)
151# define TOLOWER_ymm(...)
152# define TOLOWER_xmm(...)
153
154# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
155 VPCMPEQ s2_reg, s1_reg, reg_out
156
157# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
158
159# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
160# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
161# endif
162
163/* Warning!
164 wcscmp/wcsncmp have to use SIGNED comparison for elements.
165 strcmp/strncmp have to use UNSIGNED comparison for elements.
166*/
167
/* The main idea of the string comparison (byte or dword) using AVX2
   is to compare (VPCMPEQ) two ymm vectors. The comparison is done on
   either packed bytes or packed dwords depending on USE_AS_WCSCMP. In
   order to check for the null char, the algorithm also keeps the
   matched bytes/dwords, requiring two more AVX2 instructions (VPMINU
   and VPCMPEQ). In general, the cost of comparing VEC_SIZE bytes
   (32 bytes) is two VPCMPEQ and one VPMINU instruction, together with
   vmovdqu and testl instructions. The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked. If the
   maximum offset is reached before a difference is found, zero is
   returned. */
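
/* A minimal scalar C model of one VEC_SIZE check in the byte (strcmp)
   case, ignoring page-cross and length handling; it mirrors the
   VPCMPEQ / zero-check / vpandn / vpmovmskb / incl sequence used below
   (the function name is illustrative only):

   static int
   check_one_vec (const unsigned char *s1, const unsigned char *s2)
   {
     unsigned int ok_mask = 0;  // bit i set: bytes equal and not NUL
     for (int i = 0; i < 32; i++)
       {
	 int equal = s1[i] == s2[i];  // VPCMPEQ
	 int not_nul = s1[i] != 0;    // VPCMPEQ with ymmZERO, then vpandn
	 if (equal && not_nul)
	   ok_mask |= 1u << i;        // vpmovmskb
       }
     if (ok_mask + 1 == 0)           // incl wraps: all 32 bytes still equal
       return -1;                    // keep going
     return __builtin_ctz (~ok_mask);  // index of first mismatch or NUL
   }  */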
182
183 .section SECTION(.text), "ax", @progbits
184 .align 16
185 .type STRCMP, @function
186 .globl STRCMP
187 .hidden STRCMP
188
189# ifndef GLABEL
190# define GLABEL(...) __VA_ARGS__
191# endif
192
193# ifdef USE_AS_STRCASECMP_L
194ENTRY (GLABEL(STRCASECMP))
195 movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
196 mov %fs:(%rax), %LOCALE_REG_LP
197
	/* Either 1 or 5 bytes (depending on whether CET is enabled). */
199 .p2align 4
200END (GLABEL(STRCASECMP))
201 /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
202# endif
203
204 .p2align 4
205STRCMP:
206 cfi_startproc
207 _CET_ENDBR
208 CALL_MCOUNT
209
210# if defined USE_AS_STRCASECMP_L
211 /* We have to fall back on the C implementation for locales with
212 encodings not matching ASCII for single bytes. */
213# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
214 mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
215# else
216 mov (%LOCALE_REG), %RAX_LP
217# endif
218 testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
219 jne STRCASECMP_NONASCII
220 leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
221# endif
222
223# ifdef USE_AS_STRNCMP
	/* Don't overwrite LOCALE_REG (rcx) until we have passed
	   L(one_or_less). Otherwise we might use the wrong locale in
	   OVERFLOW_STRCMP (strcasecmp_l). */
227# ifdef __ILP32__
228 /* Clear the upper 32 bits. */
229 movl %edx, %edx
230# endif
231 cmp $1, %RDX_LP
232 /* Signed comparison intentional. We use this branch to also
233 test cases where length >= 2^63. These very large sizes can be
234 handled with strcmp as there is no way for that length to
235 actually bound the buffer. */
236 jle L(one_or_less)
237# ifdef USE_AS_WCSCMP
238 movq %rdx, %rcx
239
	/* Multiplying length by sizeof(wchar_t) can result in overflow.
	   Check if that is possible. All cases where overflow is possible
	   are cases where length is large enough that it can never be a
	   bound on valid memory, so just use wcscmp. */
244 shrq $56, %rcx
245 jnz OVERFLOW_STRCMP
246
247 leaq (, %rdx, 4), %rdx
248# endif
249# endif
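
	/* A rough C sketch of the strncmp/wcsncmp entry checks above
	   (pseudo-code; OVERFLOW_STRCMP stands in for the unbounded
	   strcmp/wcscmp alternative):

	   // Signed "len <= 1" also catches len >= 2^63: such a length can
	   // never bound real memory, so the unbounded routine is fine.
	   if ((long) len <= 1)
	     goto one_or_less;
	   // USE_AS_WCSCMP only: if len * 4 could overflow, defer to the
	   // unbounded routine, otherwise continue with a byte count.
	   if (len >> 56)
	     return OVERFLOW_STRCMP (s1, s2);
	   len *= sizeof (wchar_t);  */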
250 vpxor %xmmZERO, %xmmZERO, %xmmZERO
251# if defined USE_AS_STRCASECMP_L
252 .section .rodata.cst32, "aM", @progbits, 32
253 .align 32
254L(lcase_min):
255 .quad 0x3f3f3f3f3f3f3f3f
256 .quad 0x3f3f3f3f3f3f3f3f
257 .quad 0x3f3f3f3f3f3f3f3f
258 .quad 0x3f3f3f3f3f3f3f3f
259L(lcase_max):
260 .quad 0x9999999999999999
261 .quad 0x9999999999999999
262 .quad 0x9999999999999999
263 .quad 0x9999999999999999
264L(case_add):
265 .quad 0x2020202020202020
266 .quad 0x2020202020202020
267 .quad 0x2020202020202020
268 .quad 0x2020202020202020
269 .previous
270
271 vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
272 vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
273 vmovdqa L(case_add)(%rip), CASE_ADD_ymm
274# endif
275 movl %edi, %eax
276 orl %esi, %eax
277 sall $20, %eax
278 /* Check if s1 or s2 may cross a page in next 4x VEC loads. */
279 cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
280 ja L(page_cross)
281
282L(no_page_cross):
283 /* Safe to compare 4x vectors. */
284 VMOVU (%rdi), %ymm0
	/* 1s where s1 and s2 are equal. Just VPCMPEQ if it's not
	   strcasecmp. Otherwise converts ymm0 and the load from rsi to
	   lowercase first. ymm2 is scratch and ymm1 is the result. */
288 CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
289 /* 1s at null CHAR. */
290 VPCMPEQ %ymm0, %ymmZERO, %ymm2
291 /* 1s where s1 and s2 equal AND not null CHAR. */
292 vpandn %ymm1, %ymm2, %ymm1
293
294 /* All 1s -> keep going, any 0s -> return. */
295 vpmovmskb %ymm1, %ecx
296# ifdef USE_AS_STRNCMP
297 cmpq $VEC_SIZE, %rdx
298 jbe L(vec_0_test_len)
299# endif
300
	/* All 1s means every byte matched. incl will overflow to zero in
	   the all-equal case. Otherwise the carry from the low run of 1s
	   stops at the position of the first mismatch. */
304 incl %ecx
305 jz L(more_3x_vec)
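
	/* Worked example of the incl trick: if the first 16 bytes match
	   and byte 16 is the first mismatch/NUL, ecx is 0x0000ffff; incl
	   turns the low run of 1s into 0s and sets bit 16 (0x00010000),
	   so the tzcnt below lands exactly on the first differing
	   position. Only when all 32 bytes match does 0xffffffff wrap to
	   zero. */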
306
307 .p2align 4,, 4
308L(return_vec_0):
309 tzcntl %ecx, %ecx
310# ifdef USE_AS_WCSCMP
311 movl (%rdi, %rcx), %edx
312 xorl %eax, %eax
313 cmpl (%rsi, %rcx), %edx
314 je L(ret0)
315 setl %al
316 negl %eax
317 orl $1, %eax
318# else
319 movzbl (%rdi, %rcx), %eax
320 movzbl (%rsi, %rcx), %ecx
321 TOLOWER_gpr (%rax, %eax)
322 TOLOWER_gpr (%rcx, %ecx)
323 subl %ecx, %eax
324# endif
325L(ret0):
326L(return_vzeroupper):
327 ZERO_UPPER_VEC_REGISTERS_RETURN
328
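	/* In the USE_AS_WCSCMP paths above and below, the elements are
	   compared as SIGNED values (see the "Warning!" comment earlier)
	   and their difference may not fit in an int, so setl/negl/orl $1
	   builds the -1 / +1 result directly. A minimal C model (the
	   helper name is illustrative only):

	   static int
	   wcs_cmp_result (int a, int b)  // one wchar_t element from s1/s2
	   {
	     if (a == b)
	       return 0;
	     int is_less = a < b;   // setl %al
	     return -is_less | 1;   // negl %eax; orl $1, %eax -> -1 or +1
	   }  */
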
329# ifdef USE_AS_STRNCMP
330 .p2align 4,, 8
331L(vec_0_test_len):
332 notl %ecx
333 bzhil %edx, %ecx, %eax
334 jnz L(return_vec_0)
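
	/* A rough C model of the bzhi length check above, assuming
	   len <= VEC_SIZE on this path (bzhil with an index of 32 keeps
	   every bit); variable names are illustrative:

	   unsigned int bad = ~ok_mask;                  // notl %ecx
	   unsigned int in_range =
	     len < 32 ? bad & ((1u << len) - 1) : bad;   // bzhil %edx, %ecx, %eax
	   if (in_range != 0)  // mismatch or NUL within the first len bytes
	     goto return_vec_0;
	   return 0;           // equal up to the length bound  */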
335 /* Align if will cross fetch block. */
336 .p2align 4,, 2
337L(ret_zero):
338 xorl %eax, %eax
339 VZEROUPPER_RETURN
340
341 .p2align 4,, 5
342L(one_or_less):
343# ifdef USE_AS_STRCASECMP_L
344 /* Set locale argument for strcasecmp. */
345 movq %LOCALE_REG, %rdx
346# endif
347 jb L(ret_zero)
348 /* 'nbe' covers the case where length is negative (large
349 unsigned). */
350 jnbe OVERFLOW_STRCMP
351# ifdef USE_AS_WCSCMP
352 movl (%rdi), %edx
353 xorl %eax, %eax
354 cmpl (%rsi), %edx
355 je L(ret1)
356 setl %al
357 negl %eax
358 orl $1, %eax
359# else
360 movzbl (%rdi), %eax
361 movzbl (%rsi), %ecx
362 TOLOWER_gpr (%rax, %eax)
363 TOLOWER_gpr (%rcx, %ecx)
364 subl %ecx, %eax
365# endif
366L(ret1):
367 ret
368# endif
369
370 .p2align 4,, 10
371L(return_vec_1):
372 tzcntl %ecx, %ecx
373# ifdef USE_AS_STRNCMP
	/* rdx must be > CHAR_PER_VEC, so it is safe to subtract without
	   fear of overflow. */
376 addq $-VEC_SIZE, %rdx
377 cmpq %rcx, %rdx
378 jbe L(ret_zero)
379# endif
380# ifdef USE_AS_WCSCMP
381 movl VEC_SIZE(%rdi, %rcx), %edx
382 xorl %eax, %eax
383 cmpl VEC_SIZE(%rsi, %rcx), %edx
384 je L(ret2)
385 setl %al
386 negl %eax
387 orl $1, %eax
388# else
389 movzbl VEC_SIZE(%rdi, %rcx), %eax
390 movzbl VEC_SIZE(%rsi, %rcx), %ecx
391 TOLOWER_gpr (%rax, %eax)
392 TOLOWER_gpr (%rcx, %ecx)
393 subl %ecx, %eax
394# endif
395L(ret2):
396 VZEROUPPER_RETURN
397
398 .p2align 4,, 10
399# ifdef USE_AS_STRNCMP
400L(return_vec_3):
401 salq $32, %rcx
402# endif
403
404L(return_vec_2):
405# ifndef USE_AS_STRNCMP
406 tzcntl %ecx, %ecx
407# else
408 tzcntq %rcx, %rcx
409 cmpq %rcx, %rdx
410 jbe L(ret_zero)
411# endif
412
413# ifdef USE_AS_WCSCMP
414 movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
415 xorl %eax, %eax
416 cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
417 je L(ret3)
418 setl %al
419 negl %eax
420 orl $1, %eax
421# else
422 movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
423 movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
424 TOLOWER_gpr (%rax, %eax)
425 TOLOWER_gpr (%rcx, %ecx)
426 subl %ecx, %eax
427# endif
428L(ret3):
429 VZEROUPPER_RETURN
430
431# ifndef USE_AS_STRNCMP
432 .p2align 4,, 10
433L(return_vec_3):
434 tzcntl %ecx, %ecx
435# ifdef USE_AS_WCSCMP
436 movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
437 xorl %eax, %eax
438 cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
439 je L(ret4)
440 setl %al
441 negl %eax
442 orl $1, %eax
443# else
444 movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
445 movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
446 TOLOWER_gpr (%rax, %eax)
447 TOLOWER_gpr (%rcx, %ecx)
448 subl %ecx, %eax
449# endif
450L(ret4):
451 VZEROUPPER_RETURN
452# endif
453
454 .p2align 4,, 10
455L(more_3x_vec):
456 /* Safe to compare 4x vectors. */
457 VMOVU VEC_SIZE(%rdi), %ymm0
458 CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
459 VPCMPEQ %ymm0, %ymmZERO, %ymm2
460 vpandn %ymm1, %ymm2, %ymm1
461 vpmovmskb %ymm1, %ecx
462 incl %ecx
463 jnz L(return_vec_1)
464
465# ifdef USE_AS_STRNCMP
466 subq $(VEC_SIZE * 2), %rdx
467 jbe L(ret_zero)
468# endif
469
470 VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
471 CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
472 VPCMPEQ %ymm0, %ymmZERO, %ymm2
473 vpandn %ymm1, %ymm2, %ymm1
474 vpmovmskb %ymm1, %ecx
475 incl %ecx
476 jnz L(return_vec_2)
477
478 VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
479 CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
480 VPCMPEQ %ymm0, %ymmZERO, %ymm2
481 vpandn %ymm1, %ymm2, %ymm1
482 vpmovmskb %ymm1, %ecx
483 incl %ecx
484 jnz L(return_vec_3)
485
486# ifdef USE_AS_STRNCMP
487 cmpq $(VEC_SIZE * 2), %rdx
488 jbe L(ret_zero)
489# endif
490
491# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1. */
494 movl $2, %r8d
495
496# else
497 xorl %r8d, %r8d
498# endif
499
500 /* The prepare labels are various entry points from the page
501 cross logic. */
502L(prepare_loop):
503
504# ifdef USE_AS_STRNCMP
	/* Store N + (VEC_SIZE * 4) and place the check at the beginning
	   of the loop. */
507 leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
508# endif
509L(prepare_loop_no_len):
510
511 /* Align s1 and adjust s2 accordingly. */
512 subq %rdi, %rsi
513 andq $-(VEC_SIZE * 4), %rdi
514 addq %rdi, %rsi
515
516# ifdef USE_AS_STRNCMP
517 subq %rdi, %rdx
518# endif
519
520L(prepare_loop_aligned):
521 /* eax stores distance from rsi to next page cross. These cases
522 need to be handled specially as the 4x loop could potentially
523 read memory past the length of s1 or s2 and across a page
524 boundary. */
525 movl $-(VEC_SIZE * 4), %eax
526 subl %esi, %eax
527 andl $(PAGE_SIZE - 1), %eax
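
	/* A rough C model of the bias computed above (PAGE_SIZE = 4096,
	   VEC_SIZE = 32; variable names are illustrative; rsi is the
	   already adjusted s2 pointer):

	   unsigned int to_page =
	     PAGE_SIZE - ((unsigned int) (uintptr_t) rsi & (PAGE_SIZE - 1));
	   unsigned int bias = (to_page - VEC_SIZE * 4) & (PAGE_SIZE - 1);
	   // Each iteration subtracts VEC_SIZE * 4 from this bias and takes
	   // the page-cross path once the subtraction would borrow, i.e.
	   // once the next 4x VEC of loads from rsi could touch a new page.  */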
528
529 /* Loop 4x comparisons at a time. */
530 .p2align 4
531L(loop):
532
533 /* End condition for strncmp. */
534# ifdef USE_AS_STRNCMP
535 subq $(VEC_SIZE * 4), %rdx
536 jbe L(ret_zero)
537# endif
538
539 subq $-(VEC_SIZE * 4), %rdi
540 subq $-(VEC_SIZE * 4), %rsi
541
542 /* Check if rsi loads will cross a page boundary. */
543 addl $-(VEC_SIZE * 4), %eax
544 jnb L(page_cross_during_loop)
545
546 /* Loop entry after handling page cross during loop. */
547L(loop_skip_page_cross_check):
548 VMOVA (VEC_SIZE * 0)(%rdi), %ymm0
549 VMOVA (VEC_SIZE * 1)(%rdi), %ymm2
550 VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
551 VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
552
553 /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
554 CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
555 CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
556 CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
557 CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
558
	/* If there is any mismatch or null CHAR then the element becomes
	   0, otherwise it stays non-zero. */
561 vpand %ymm0, %ymm1, %ymm1
562
563
564 vpand %ymm2, %ymm3, %ymm3
565 vpand %ymm4, %ymm5, %ymm5
566 vpand %ymm6, %ymm7, %ymm7
567
568 VPMINU %ymm1, %ymm3, %ymm3
569 VPMINU %ymm5, %ymm7, %ymm7
570
571 /* Reduce all 0 CHARs for the 4x VEC into ymm7. */
572 VPMINU %ymm3, %ymm7, %ymm7
573
574 /* If any 0 CHAR then done. */
575 VPCMPEQ %ymm7, %ymmZERO, %ymm7
576 vpmovmskb %ymm7, %LOOP_REG
577 testl %LOOP_REG, %LOOP_REG
578 jz L(loop)
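
	/* A minimal scalar C model of the vpand / VPMINU reduction above
	   (byte case, ignoring case folding; the function name is
	   illustrative only). After the vpand, a byte is zero exactly when
	   it mismatched or was NUL, so the element-wise unsigned minimum
	   of the four vectors is zero somewhere iff any of the 128 bytes
	   needs a closer look:

	   static int
	   any_mismatch_or_nul (const unsigned char *s1, const unsigned char *s2)
	   {
	     for (int i = 0; i < 32; i++)
	       {
		 unsigned char m = 0xff;                // running VPMINU at lane i
		 for (int v = 0; v < 4; v++)
		   {
		     unsigned char c = s1[v * 32 + i];
		     unsigned char eq =
		       c == s2[v * 32 + i] ? 0xff : 0;   // VPCMPEQ
		     unsigned char keep = eq & c;        // vpand: 0 on mismatch or NUL
		     if (keep < m)
		       m = keep;                         // VPMINU
		   }
		 if (m == 0)                             // VPCMPEQ with ymmZERO
		   return 1;                             // sort out which VEC below
	       }
	     return 0;                                   // all 128 bytes equal, no NUL
	   }  */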
579
	/* Find which VEC has the mismatch or end of string. */
581 VPCMPEQ %ymm1, %ymmZERO, %ymm1
582 vpmovmskb %ymm1, %ecx
583 testl %ecx, %ecx
584 jnz L(return_vec_0_end)
585
586
587 VPCMPEQ %ymm3, %ymmZERO, %ymm3
588 vpmovmskb %ymm3, %ecx
589 testl %ecx, %ecx
590 jnz L(return_vec_1_end)
591
592L(return_vec_2_3_end):
593# ifdef USE_AS_STRNCMP
594 subq $(VEC_SIZE * 2), %rdx
595 jbe L(ret_zero_end)
596# endif
597
598 VPCMPEQ %ymm5, %ymmZERO, %ymm5
599 vpmovmskb %ymm5, %ecx
600 testl %ecx, %ecx
601 jnz L(return_vec_2_end)
602
	/* LOOP_REG contains matches for null/mismatch from the loop. If
	   VEC 0, 1, and 2 all have no null and no mismatches then the
	   mismatch must be entirely from VEC 3, which is fully
	   represented by LOOP_REG. */
607 tzcntl %LOOP_REG, %LOOP_REG
608
609# ifdef USE_AS_STRNCMP
610 subl $-(VEC_SIZE), %LOOP_REG
611 cmpq %LOOP_REG64, %rdx
612 jbe L(ret_zero_end)
613# endif
614
615# ifdef USE_AS_WCSCMP
616 movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
617 xorl %eax, %eax
618 cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
619 je L(ret5)
620 setl %al
621 negl %eax
622 xorl %r8d, %eax
623# else
624 movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
625 movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
626 TOLOWER_gpr (%rax, %eax)
627 TOLOWER_gpr (%rcx, %ecx)
628 subl %ecx, %eax
629 xorl %r8d, %eax
630 subl %r8d, %eax
631# endif
632L(ret5):
633 VZEROUPPER_RETURN
634
635# ifdef USE_AS_STRNCMP
636 .p2align 4,, 2
637L(ret_zero_end):
638 xorl %eax, %eax
639 VZEROUPPER_RETURN
640# endif
641
642
	/* The L(return_vec_N_end) labels differ from L(return_vec_N) in
	   that they use the value of `r8` to negate the return value.
	   This is because the page cross logic can swap `rdi` and
	   `rsi`. */
646 .p2align 4,, 10
647# ifdef USE_AS_STRNCMP
648L(return_vec_1_end):
649 salq $32, %rcx
650# endif
651L(return_vec_0_end):
652# ifndef USE_AS_STRNCMP
653 tzcntl %ecx, %ecx
654# else
655 tzcntq %rcx, %rcx
656 cmpq %rcx, %rdx
657 jbe L(ret_zero_end)
658# endif
659
660# ifdef USE_AS_WCSCMP
661 movl (%rdi, %rcx), %edx
662 xorl %eax, %eax
663 cmpl (%rsi, %rcx), %edx
664 je L(ret6)
665 setl %al
666 negl %eax
667 xorl %r8d, %eax
668# else
669 movzbl (%rdi, %rcx), %eax
670 movzbl (%rsi, %rcx), %ecx
671 TOLOWER_gpr (%rax, %eax)
672 TOLOWER_gpr (%rcx, %ecx)
673 subl %ecx, %eax
674 xorl %r8d, %eax
675 subl %r8d, %eax
676# endif
677L(ret6):
678 VZEROUPPER_RETURN
679
680# ifndef USE_AS_STRNCMP
681 .p2align 4,, 10
682L(return_vec_1_end):
683 tzcntl %ecx, %ecx
684# ifdef USE_AS_WCSCMP
685 movl VEC_SIZE(%rdi, %rcx), %edx
686 xorl %eax, %eax
687 cmpl VEC_SIZE(%rsi, %rcx), %edx
688 je L(ret7)
689 setl %al
690 negl %eax
691 xorl %r8d, %eax
692# else
693 movzbl VEC_SIZE(%rdi, %rcx), %eax
694 movzbl VEC_SIZE(%rsi, %rcx), %ecx
695 TOLOWER_gpr (%rax, %eax)
696 TOLOWER_gpr (%rcx, %ecx)
697 subl %ecx, %eax
698 xorl %r8d, %eax
699 subl %r8d, %eax
700# endif
701L(ret7):
702 VZEROUPPER_RETURN
703# endif
704
705 .p2align 4,, 10
706L(return_vec_2_end):
707 tzcntl %ecx, %ecx
708# ifdef USE_AS_STRNCMP
709 cmpq %rcx, %rdx
710 jbe L(ret_zero_page_cross)
711# endif
712# ifdef USE_AS_WCSCMP
713 movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
714 xorl %eax, %eax
715 cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
716 je L(ret11)
717 setl %al
718 negl %eax
719 xorl %r8d, %eax
720# else
721 movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
722 movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
723 TOLOWER_gpr (%rax, %eax)
724 TOLOWER_gpr (%rcx, %ecx)
725 subl %ecx, %eax
726 xorl %r8d, %eax
727 subl %r8d, %eax
728# endif
729L(ret11):
730 VZEROUPPER_RETURN
731
732
733 /* Page cross in rsi in next 4x VEC. */
734
735 /* TODO: Improve logic here. */
736 .p2align 4,, 10
737L(page_cross_during_loop):
738 /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */
739
	/* Optimistically rsi and rdi are both aligned, in which case we
	   don't need any logic here. */
742 cmpl $-(VEC_SIZE * 4), %eax
	/* eax is deliberately not re-adjusted before jumping back to the
	   loop, so we will never hit the page cross case again. */
745 je L(loop_skip_page_cross_check)
746
747 /* Check if we can safely load a VEC. */
748 cmpl $-(VEC_SIZE * 3), %eax
749 jle L(less_1x_vec_till_page_cross)
750
751 VMOVA (%rdi), %ymm0
752 CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
753 VPCMPEQ %ymm0, %ymmZERO, %ymm2
754 vpandn %ymm1, %ymm2, %ymm1
755 vpmovmskb %ymm1, %ecx
756 incl %ecx
757 jnz L(return_vec_0_end)
758
759 /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */
760 cmpl $-(VEC_SIZE * 2), %eax
761 jg L(more_2x_vec_till_page_cross)
762
763 .p2align 4,, 4
764L(less_1x_vec_till_page_cross):
765 subl $-(VEC_SIZE * 4), %eax
	/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
	   concerning case is the first iteration, if the incoming s1 was
	   near the start of a page and s2 near the end. If s1 was near
	   the start of the page we already aligned up to the nearest
	   VEC_SIZE * 4, so it is guaranteed safe to read back -VEC_SIZE.
	   If rdi is truly at the start of a page here, it means the
	   previous page (rdi - VEC_SIZE) has already been loaded earlier
	   so must be valid. */
773 VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
774 CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
775 VPCMPEQ %ymm0, %ymmZERO, %ymm2
776 vpandn %ymm1, %ymm2, %ymm1
777 vpmovmskb %ymm1, %ecx
778
	/* Mask of potentially valid bits. The lower bits can come from
	   out-of-range comparisons (but they are safe regarding page
	   crosses). */
781 movl $-1, %r10d
782 shlxl %esi, %r10d, %r10d
783 notl %ecx
784
785# ifdef USE_AS_STRNCMP
786 cmpq %rax, %rdx
787 jbe L(return_page_cross_end_check)
788# endif
789 movl %eax, %OFFSET_REG
790 addl $(PAGE_SIZE - VEC_SIZE * 4), %eax
791
792 andl %r10d, %ecx
793 jz L(loop_skip_page_cross_check)
794
795 .p2align 4,, 3
796L(return_page_cross_end):
797 tzcntl %ecx, %ecx
798
799# ifdef USE_AS_STRNCMP
800 leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
801L(return_page_cross_cmp_mem):
802# else
803 addl %OFFSET_REG, %ecx
804# endif
805# ifdef USE_AS_WCSCMP
806 movl VEC_OFFSET(%rdi, %rcx), %edx
807 xorl %eax, %eax
808 cmpl VEC_OFFSET(%rsi, %rcx), %edx
809 je L(ret8)
810 setl %al
811 negl %eax
812 xorl %r8d, %eax
813# else
814 movzbl VEC_OFFSET(%rdi, %rcx), %eax
815 movzbl VEC_OFFSET(%rsi, %rcx), %ecx
816 TOLOWER_gpr (%rax, %eax)
817 TOLOWER_gpr (%rcx, %ecx)
818 subl %ecx, %eax
819 xorl %r8d, %eax
820 subl %r8d, %eax
821# endif
822L(ret8):
823 VZEROUPPER_RETURN
824
825# ifdef USE_AS_STRNCMP
826 .p2align 4,, 10
827L(return_page_cross_end_check):
828 andl %r10d, %ecx
829 tzcntl %ecx, %ecx
830 leal -VEC_SIZE(%rax, %rcx), %ecx
831 cmpl %ecx, %edx
832 ja L(return_page_cross_cmp_mem)
833 xorl %eax, %eax
834 VZEROUPPER_RETURN
835# endif
836
837
838 .p2align 4,, 10
839L(more_2x_vec_till_page_cross):
	/* If there are more than 2x VEC until the page cross we will
	   complete a full loop iteration here. */
842
843 VMOVU VEC_SIZE(%rdi), %ymm0
844 CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
845 VPCMPEQ %ymm0, %ymmZERO, %ymm2
846 vpandn %ymm1, %ymm2, %ymm1
847 vpmovmskb %ymm1, %ecx
848 incl %ecx
849 jnz L(return_vec_1_end)
850
851# ifdef USE_AS_STRNCMP
852 cmpq $(VEC_SIZE * 2), %rdx
853 jbe L(ret_zero_in_loop_page_cross)
854# endif
855
856 subl $-(VEC_SIZE * 4), %eax
857
858 /* Safe to include comparisons from lower bytes. */
859 VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
860 CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
861 VPCMPEQ %ymm0, %ymmZERO, %ymm2
862 vpandn %ymm1, %ymm2, %ymm1
863 vpmovmskb %ymm1, %ecx
864 incl %ecx
865 jnz L(return_vec_page_cross_0)
866
867 VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
868 CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
869 VPCMPEQ %ymm0, %ymmZERO, %ymm2
870 vpandn %ymm1, %ymm2, %ymm1
871 vpmovmskb %ymm1, %ecx
872 incl %ecx
873 jnz L(return_vec_page_cross_1)
874
875# ifdef USE_AS_STRNCMP
	/* Must check length here as the length might preclude reading
	   the next page. */
878 cmpq %rax, %rdx
879 jbe L(ret_zero_in_loop_page_cross)
880# endif
881
882 /* Finish the loop. */
883 VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
884 VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
885
886 CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
887 CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
888 vpand %ymm4, %ymm5, %ymm5
889 vpand %ymm6, %ymm7, %ymm7
890 VPMINU %ymm5, %ymm7, %ymm7
891 VPCMPEQ %ymm7, %ymmZERO, %ymm7
892 vpmovmskb %ymm7, %LOOP_REG
893 testl %LOOP_REG, %LOOP_REG
894 jnz L(return_vec_2_3_end)
895
	/* Best for code size to include an unconditional jmp here. If
	   this case is hot it would be faster to duplicate the
	   L(return_vec_2_3_end) code as the fall-through and jump back
	   to the loop on the mismatch comparison. */
900 subq $-(VEC_SIZE * 4), %rdi
901 subq $-(VEC_SIZE * 4), %rsi
902 addl $(PAGE_SIZE - VEC_SIZE * 8), %eax
903# ifdef USE_AS_STRNCMP
904 subq $(VEC_SIZE * 4), %rdx
905 ja L(loop_skip_page_cross_check)
906L(ret_zero_in_loop_page_cross):
907 xorl %eax, %eax
908 VZEROUPPER_RETURN
909# else
910 jmp L(loop_skip_page_cross_check)
911# endif
912
913
914 .p2align 4,, 10
915L(return_vec_page_cross_0):
916 addl $-VEC_SIZE, %eax
917L(return_vec_page_cross_1):
918 tzcntl %ecx, %ecx
919# ifdef USE_AS_STRNCMP
920 leal -VEC_SIZE(%rax, %rcx), %ecx
921 cmpq %rcx, %rdx
922 jbe L(ret_zero_in_loop_page_cross)
923# else
924 addl %eax, %ecx
925# endif
926
927# ifdef USE_AS_WCSCMP
928 movl VEC_OFFSET(%rdi, %rcx), %edx
929 xorl %eax, %eax
930 cmpl VEC_OFFSET(%rsi, %rcx), %edx
931 je L(ret9)
932 setl %al
933 negl %eax
934 xorl %r8d, %eax
935# else
936 movzbl VEC_OFFSET(%rdi, %rcx), %eax
937 movzbl VEC_OFFSET(%rsi, %rcx), %ecx
938 TOLOWER_gpr (%rax, %eax)
939 TOLOWER_gpr (%rcx, %ecx)
940 subl %ecx, %eax
941 xorl %r8d, %eax
942 subl %r8d, %eax
943# endif
944L(ret9):
945 VZEROUPPER_RETURN
946
947
948 .p2align 4,, 10
949L(page_cross):
950# ifndef USE_AS_STRNCMP
	/* If both are VEC aligned we don't need any special logic here.
	   Only valid for strcmp, where the stop condition is guaranteed
	   to be reachable by just reading memory. */
954 testl $((VEC_SIZE - 1) << 20), %eax
955 jz L(no_page_cross)
956# endif
957
958 movl %edi, %eax
959 movl %esi, %ecx
960 andl $(PAGE_SIZE - 1), %eax
961 andl $(PAGE_SIZE - 1), %ecx
962
963 xorl %OFFSET_REG, %OFFSET_REG
964
965 /* Check which is closer to page cross, s1 or s2. */
966 cmpl %eax, %ecx
967 jg L(page_cross_s2)
968
	/* The previous page cross check has false positives. Check for a
	   true positive, as the page cross logic is very expensive. */
971 subl $(PAGE_SIZE - VEC_SIZE * 4), %eax
972 jbe L(no_page_cross)
973
974 /* Set r8 to not interfere with normal return value (rdi and rsi
975 did not swap). */
976# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1. */
979 movl $2, %r8d
980# else
981 xorl %r8d, %r8d
982# endif
983
984 /* Check if less than 1x VEC till page cross. */
985 subl $(VEC_SIZE * 3), %eax
986 jg L(less_1x_vec_till_page)
987
	/* If more than 1x VEC till page cross, loop through safely
	   loadable memory until within 1x VEC of the page cross. */
990
991 .p2align 4,, 10
992L(page_cross_loop):
993
994 VMOVU (%rdi, %OFFSET_REG64), %ymm0
995 CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
996 VPCMPEQ %ymm0, %ymmZERO, %ymm2
997 vpandn %ymm1, %ymm2, %ymm1
998 vpmovmskb %ymm1, %ecx
999 incl %ecx
1000
1001 jnz L(check_ret_vec_page_cross)
1002 addl $VEC_SIZE, %OFFSET_REG
1003# ifdef USE_AS_STRNCMP
1004 cmpq %OFFSET_REG64, %rdx
1005 jbe L(ret_zero_page_cross)
1006# endif
1007 addl $VEC_SIZE, %eax
1008 jl L(page_cross_loop)
1009
1010 subl %eax, %OFFSET_REG
	/* OFFSET_REG has the distance to the page cross - VEC_SIZE.
	   Guaranteed to not cross the page so it is safe to load. Since
	   we have already loaded at least 1 VEC from rsi it is also
	   guaranteed to be safe. */
1015
1016 VMOVU (%rdi, %OFFSET_REG64), %ymm0
1017 CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
1018 VPCMPEQ %ymm0, %ymmZERO, %ymm2
1019 vpandn %ymm1, %ymm2, %ymm1
1020 vpmovmskb %ymm1, %ecx
1021
1022# ifdef USE_AS_STRNCMP
1023 leal VEC_SIZE(%OFFSET_REG64), %eax
1024 cmpq %rax, %rdx
1025 jbe L(check_ret_vec_page_cross2)
1026 addq %rdi, %rdx
1027# endif
1028 incl %ecx
1029 jz L(prepare_loop_no_len)
1030
1031 .p2align 4,, 4
1032L(ret_vec_page_cross):
1033# ifndef USE_AS_STRNCMP
1034L(check_ret_vec_page_cross):
1035# endif
1036 tzcntl %ecx, %ecx
1037 addl %OFFSET_REG, %ecx
1038L(ret_vec_page_cross_cont):
1039# ifdef USE_AS_WCSCMP
1040 movl (%rdi, %rcx), %edx
1041 xorl %eax, %eax
1042 cmpl (%rsi, %rcx), %edx
1043 je L(ret12)
1044 setl %al
1045 negl %eax
1046 xorl %r8d, %eax
1047# else
1048 movzbl (%rdi, %rcx), %eax
1049 movzbl (%rsi, %rcx), %ecx
1050 TOLOWER_gpr (%rax, %eax)
1051 TOLOWER_gpr (%rcx, %ecx)
1052 subl %ecx, %eax
1053 xorl %r8d, %eax
1054 subl %r8d, %eax
1055# endif
1056L(ret12):
1057 VZEROUPPER_RETURN
1058
1059# ifdef USE_AS_STRNCMP
1060 .p2align 4,, 10
1061L(check_ret_vec_page_cross2):
1062 incl %ecx
1063L(check_ret_vec_page_cross):
1064 tzcntl %ecx, %ecx
1065 addl %OFFSET_REG, %ecx
1066 cmpq %rcx, %rdx
1067 ja L(ret_vec_page_cross_cont)
1068 .p2align 4,, 2
1069L(ret_zero_page_cross):
1070 xorl %eax, %eax
1071 VZEROUPPER_RETURN
1072# endif
1073
1074 .p2align 4,, 4
1075L(page_cross_s2):
1076 /* Ensure this is a true page cross. */
1077 subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx
1078 jbe L(no_page_cross)
1079
1080
1081 movl %ecx, %eax
1082 movq %rdi, %rcx
1083 movq %rsi, %rdi
1084 movq %rcx, %rsi
1085
	/* Set r8 to negate the return value, as rdi and rsi were
	   swapped. */
1087# ifdef USE_AS_WCSCMP
1088 movl $-4, %r8d
1089# else
1090 movl $-1, %r8d
1091# endif
1092 xorl %OFFSET_REG, %OFFSET_REG
1093
1094 /* Check if more than 1x VEC till page cross. */
1095 subl $(VEC_SIZE * 3), %eax
1096 jle L(page_cross_loop)
1097
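	/* At L(less_1x_vec_till_page) below, eax holds VEC_SIZE minus the
	   number of bytes left on the page for the pointer closest to its
	   page end, so the code picks the widest load that cannot cross.
	   A rough sketch (helper names are illustrative placeholders):

	   unsigned int left_on_page = 32 - eax;  // 1 .. 31
	   if (left_on_page >= 16)       // cmpl $16, %eax; ja ...
	     use_16_byte_xmm_loads ();
	   else if (left_on_page >= 8)   // cmpl $24, %eax; ja ...
	     use_8_byte_vmovq_loads ();
	   else if (left_on_page >= 4)   // cmpl $28, %eax; ja ...
	     use_4_byte_vmovd_loads ();  // or the wchar_t scalar path
	   else
	     byte_at_a_time_loop ();  */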
1098 .p2align 4,, 6
1099L(less_1x_vec_till_page):
1100 /* Find largest load size we can use. */
1101 cmpl $16, %eax
1102 ja L(less_16_till_page)
1103
1104 VMOVU (%rdi), %xmm0
1105 CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
1106 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1107 vpandn %xmm1, %xmm2, %xmm1
1108 vpmovmskb %ymm1, %ecx
1109 incw %cx
1110 jnz L(check_ret_vec_page_cross)
1111 movl $16, %OFFSET_REG
1112# ifdef USE_AS_STRNCMP
1113 cmpq %OFFSET_REG64, %rdx
1114 jbe L(ret_zero_page_cross_slow_case0)
1115 subl %eax, %OFFSET_REG
1116# else
1117 /* Explicit check for 16 byte alignment. */
1118 subl %eax, %OFFSET_REG
1119 jz L(prepare_loop)
1120# endif
1121
1122 VMOVU (%rdi, %OFFSET_REG64), %xmm0
1123 CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
1124 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1125 vpandn %xmm1, %xmm2, %xmm1
1126 vpmovmskb %ymm1, %ecx
1127 incw %cx
1128 jnz L(check_ret_vec_page_cross)
1129
1130# ifdef USE_AS_STRNCMP
1131 addl $16, %OFFSET_REG
1132 subq %OFFSET_REG64, %rdx
1133 jbe L(ret_zero_page_cross_slow_case0)
1134 subq $-(VEC_SIZE * 4), %rdx
1135
1136 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1137 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1138# else
1139 leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1140 leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1141# endif
1142 jmp L(prepare_loop_aligned)
1143
1144# ifdef USE_AS_STRNCMP
1145 .p2align 4,, 2
1146L(ret_zero_page_cross_slow_case0):
1147 xorl %eax, %eax
1148 ret
1149# endif
1150
1151
1152 .p2align 4,, 10
1153L(less_16_till_page):
1154 /* Find largest load size we can use. */
1155 cmpl $24, %eax
1156 ja L(less_8_till_page)
1157
1158 vmovq (%rdi), %xmm0
1159 vmovq (%rsi), %xmm1
1160 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1161 CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1162 vpandn %xmm1, %xmm2, %xmm1
1163 vpmovmskb %ymm1, %ecx
1164 incb %cl
1165 jnz L(check_ret_vec_page_cross)
1166
1167
1168# ifdef USE_AS_STRNCMP
1169 cmpq $8, %rdx
1170 jbe L(ret_zero_page_cross_slow_case0)
1171# endif
1172 movl $24, %OFFSET_REG
1173 /* Explicit check for 16 byte alignment. */
1174 subl %eax, %OFFSET_REG
1175
1176
1177
1178 vmovq (%rdi, %OFFSET_REG64), %xmm0
1179 vmovq (%rsi, %OFFSET_REG64), %xmm1
1180 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1181 CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1182 vpandn %xmm1, %xmm2, %xmm1
1183 vpmovmskb %ymm1, %ecx
1184 incb %cl
1185 jnz L(check_ret_vec_page_cross)
1186
1187# ifdef USE_AS_STRNCMP
1188 addl $8, %OFFSET_REG
1189 subq %OFFSET_REG64, %rdx
1190 jbe L(ret_zero_page_cross_slow_case0)
1191 subq $-(VEC_SIZE * 4), %rdx
1192
1193 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1194 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1195# else
1196 leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1197 leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1198# endif
1199 jmp L(prepare_loop_aligned)
1200
1201
1202 .p2align 4,, 10
1203L(less_8_till_page):
1204# ifdef USE_AS_WCSCMP
1205 /* If using wchar then this is the only check before we reach
1206 the page boundary. */
1207 movl (%rdi), %eax
1208 movl (%rsi), %ecx
1209 cmpl %ecx, %eax
1210 jnz L(ret_less_8_wcs)
1211# ifdef USE_AS_STRNCMP
1212 addq %rdi, %rdx
1213 /* We already checked for len <= 1 so cannot hit that case here.
1214 */
1215# endif
1216 testl %eax, %eax
1217 jnz L(prepare_loop_no_len)
1218 ret
1219
1220 .p2align 4,, 8
1221L(ret_less_8_wcs):
1222 setl %OFFSET_REG8
1223 negl %OFFSET_REG
1224 movl %OFFSET_REG, %eax
1225 xorl %r8d, %eax
1226 ret
1227
1228# else
1229
1230 /* Find largest load size we can use. */
1231 cmpl $28, %eax
1232 ja L(less_4_till_page)
1233
1234 vmovd (%rdi), %xmm0
1235 vmovd (%rsi), %xmm1
1236 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1237 CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1238 vpandn %xmm1, %xmm2, %xmm1
1239 vpmovmskb %ymm1, %ecx
1240 subl $0xf, %ecx
1241 jnz L(check_ret_vec_page_cross)
1242
1243# ifdef USE_AS_STRNCMP
1244 cmpq $4, %rdx
1245 jbe L(ret_zero_page_cross_slow_case1)
1246# endif
1247 movl $28, %OFFSET_REG
1248 /* Explicit check for 16 byte alignment. */
1249 subl %eax, %OFFSET_REG
1250
1251
1252
1253 vmovd (%rdi, %OFFSET_REG64), %xmm0
1254 vmovd (%rsi, %OFFSET_REG64), %xmm1
1255 VPCMPEQ %xmm0, %xmmZERO, %xmm2
1256 CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
1257 vpandn %xmm1, %xmm2, %xmm1
1258 vpmovmskb %ymm1, %ecx
1259 subl $0xf, %ecx
1260 jnz L(check_ret_vec_page_cross)
1261
1262# ifdef USE_AS_STRNCMP
1263 addl $4, %OFFSET_REG
1264 subq %OFFSET_REG64, %rdx
1265 jbe L(ret_zero_page_cross_slow_case1)
1266 subq $-(VEC_SIZE * 4), %rdx
1267
1268 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1269 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1270# else
1271 leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
1272 leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
1273# endif
1274 jmp L(prepare_loop_aligned)
1275
1276# ifdef USE_AS_STRNCMP
1277 .p2align 4,, 2
1278L(ret_zero_page_cross_slow_case1):
1279 xorl %eax, %eax
1280 ret
1281# endif
1282
1283 .p2align 4,, 10
1284L(less_4_till_page):
1285 subq %rdi, %rsi
1286 /* Extremely slow byte comparison loop. */
1287L(less_4_loop):
1288 movzbl (%rdi), %eax
1289 movzbl (%rsi, %rdi), %ecx
1290 TOLOWER_gpr (%rax, %eax)
1291 TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
1292 subl %BYTE_LOOP_REG, %eax
1293 jnz L(ret_less_4_loop)
1294 testl %ecx, %ecx
1295 jz L(ret_zero_4_loop)
1296# ifdef USE_AS_STRNCMP
1297 decq %rdx
1298 jz L(ret_zero_4_loop)
1299# endif
1300 incq %rdi
	/* The end condition is reaching the page boundary (rdi is
	   aligned). */
1302 testl $31, %edi
1303 jnz L(less_4_loop)
1304 leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
1305 addq $-(VEC_SIZE * 4), %rdi
1306# ifdef USE_AS_STRNCMP
1307 subq $-(VEC_SIZE * 4), %rdx
1308# endif
1309 jmp L(prepare_loop_aligned)
1310
1311L(ret_zero_4_loop):
1312 xorl %eax, %eax
1313 ret
1314L(ret_less_4_loop):
1315 xorl %r8d, %eax
1316 subl %r8d, %eax
1317 ret
1318# endif
1319 cfi_endproc
1320 .size STRCMP, .-STRCMP
1321#endif
1322
