1/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <isa-level.h>
20
21#if ISA_SHOULD_BUILD (4)
22
23# ifndef VEC_SIZE
24# include "x86-evex256-vecs.h"
25# endif
26
27# define STRCMP_ISA _evex
28# include "strcmp-naming.h"
29
30# include <sysdep.h>
31# if defined USE_AS_STRCASECMP_L
32# include "locale-defines.h"
33# endif
34
35# ifndef STRCMP
36# define STRCMP __strcmp_evex
37# endif
38
39# define PAGE_SIZE 4096
40
41 /* VEC_SIZE = Number of bytes in a ymm register. */
42# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR)
43
44# ifdef USE_AS_WCSCMP
45 /* Compare packed dwords. */
46# define VPCMP vpcmpd
47# define VPCMPEQ vpcmpeqd
48# define VPMINU vpminud
49# define VPTESTM vptestmd
50# define VPTESTNM vptestnmd
51 /* 1 dword char == 4 bytes. */
52# define SIZE_OF_CHAR 4
53
54# define TESTEQ sub $((1 << CHAR_PER_VEC) - 1),
55
56# define USE_WIDE_CHAR
57# else
58 /* Compare packed bytes. */
59# define VPCMP vpcmpb
60# define VPCMPEQ vpcmpeqb
61# define VPMINU vpminub
62# define VPTESTM vptestmb
63# define VPTESTNM vptestnmb
64 /* 1 byte char == 1 byte. */
65# define SIZE_OF_CHAR 1
66
67# define TESTEQ inc
68# endif
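/* For strcmp/strncmp the comparison mask occupies the full width of the
   mask GPR (CHAR_PER_VEC == VEC_SIZE), so an all-equal result is an
   all-ones register and `inc` wraps it to zero (setting ZF).  For
   wcscmp/wcsncmp only the low CHAR_PER_VEC (VEC_SIZE / 4) bits are
   significant, so the all-equal value is (1 << CHAR_PER_VEC) - 1 and is
   tested by subtracting that constant instead.  Either way ZF is set
   exactly in the all-equal case, and the first zero bit of the original
   mask becomes the lowest set bit of the result.  */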
69
70# include "reg-macros.h"
71
72# if VEC_SIZE == 64
73# define RODATA_SECTION rodata.cst64
74# else
75# define RODATA_SECTION rodata.cst32
76# endif
77
78# if CHAR_PER_VEC == 64
79# define FALLTHROUGH_RETURN_OFFSET (VEC_SIZE * 3)
80# else
81# define FALLTHROUGH_RETURN_OFFSET (VEC_SIZE * 2)
82# endif
83
84# ifdef USE_AS_STRNCMP
85# define LOOP_REG VR9
86# define LOOP_REG64 r9
87
88# define OFFSET_REG8 r9b
89# define OFFSET_REG r9d
90# define OFFSET_REG64 r9
91# else
92# define LOOP_REG VRDX
93# define LOOP_REG64 rdx
94
95# define OFFSET_REG8 dl
96# define OFFSET_REG edx
97# define OFFSET_REG64 rdx
98# endif
99
100# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
101# define VEC_OFFSET 0
102# else
103# define VEC_OFFSET (-VEC_SIZE)
104# endif
105
106# ifdef USE_AS_STRCASECMP_L
107# define BYTE_LOOP_REG OFFSET_REG
108# else
109# define BYTE_LOOP_REG ecx
110# endif
111
112# ifdef USE_AS_STRCASECMP_L
113# ifdef USE_AS_STRNCMP
114# define LOCALE_REG rcx
115# define LOCALE_REG_LP RCX_LP
116# else
117# define LOCALE_REG rdx
118# define LOCALE_REG_LP RDX_LP
119# endif
120# endif
121
122# define LCASE_MIN_V VMM(12)
123# define LCASE_MAX_V VMM(13)
124# define CASE_ADD_V VMM(14)
125
126# if VEC_SIZE == 64
127# define LCASE_MIN_YMM VMM_256(12)
128# define LCASE_MAX_YMM VMM_256(13)
129# define CASE_ADD_YMM VMM_256(14)
130# endif
131
132# define LCASE_MIN_XMM VMM_128(12)
133# define LCASE_MAX_XMM VMM_128(13)
134# define CASE_ADD_XMM VMM_128(14)
135
136 /* NB: wcsncmp uses r11 but strcasecmp is never used in
137 conjunction with wcscmp. */
138# define TOLOWER_BASE %r11
139
140# ifdef USE_AS_STRCASECMP_L
141# define _REG(x, y) x ## y
142# define REG(x, y) _REG(x, y)
143# define TOLOWER(reg1, reg2, ext, vec_macro) \
144 vpsubb %REG(LCASE_MIN_, ext), reg1, %vec_macro(10); \
145 vpsubb %REG(LCASE_MIN_, ext), reg2, %vec_macro(11); \
146 vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(10), %k5; \
147 vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(11), %k6; \
148 vpaddb reg1, %REG(CASE_ADD_, ext), reg1{%k5}; \
149 vpaddb reg2, %REG(CASE_ADD_, ext), reg2{%k6}
150
151# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
152# define TOLOWER_VMM(...) TOLOWER(__VA_ARGS__, V, VMM)
153# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM, VMM_256)
154# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM, VMM_128)
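/* The TOLOWER sequence above performs branchless ASCII case folding on a
   whole vector: subtract 'A' (0x41, L(lcase_min)) from every byte, do an
   unsigned less-than compare against 26 (0x1a, L(lcase_max)) to build a
   mask of the bytes that were in 'A'..'Z', and add 0x20 (L(case_add))
   only under that mask.  A scalar C sketch of the same transform,
   illustrative only and not part of this file:

	static inline unsigned char
	ascii_tolower (unsigned char c)
	{
	  return ((unsigned char) (c - 0x41) < 0x1a)
		 ? (unsigned char) (c + 0x20) : c;
	}
 */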
155
156# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext, vec_macro) \
157 TOLOWER (s1_reg, s2_reg, ext, vec_macro); \
158 VPCMPEQ s1_reg, s2_reg, reg_out
159
160# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext, vec_macro) \
161 VMOVU s2_mem, s2_reg; \
162 CMP_R1_R2 (s1_reg, s2_reg, reg_out, ext, vec_macro)
163
164# define CMP_R1_R2_VMM(...) CMP_R1_R2(__VA_ARGS__, V, VMM)
165# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM, VMM_256)
166# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM, VMM_128)
167
168# define CMP_R1_S2_VMM(...) CMP_R1_S2(__VA_ARGS__, V, VMM)
169# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM, VMM_256)
170# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM, VMM_128)
171
172# else
173# define TOLOWER_gpr(...)
174# define TOLOWER_VMM(...)
175# define TOLOWER_YMM(...)
176# define TOLOWER_XMM(...)
177
178# define CMP_R1_R2_VMM(s1_reg, s2_reg, reg_out) \
179 VPCMPEQ s2_reg, s1_reg, reg_out
180
181# define CMP_R1_R2_YMM(...) CMP_R1_R2_VMM(__VA_ARGS__)
182# define CMP_R1_R2_XMM(...) CMP_R1_R2_VMM(__VA_ARGS__)
183
184# define CMP_R1_S2_VMM(s1_reg, s2_mem, unused, reg_out) \
185 VPCMPEQ s2_mem, s1_reg, reg_out
186# define CMP_R1_S2_YMM(...) CMP_R1_S2_VMM(__VA_ARGS__)
187# define CMP_R1_S2_XMM(...) CMP_R1_S2_VMM(__VA_ARGS__)
188# endif
189
190/* Warning!
191 wcscmp/wcsncmp have to use SIGNED comparison for elements.
192 strcmp/strncmp have to use UNSIGNED comparison for elements.
193*/
194
/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
   comparison operates on either packed bytes or packed dwords depending
   on USE_AS_WCSCMP.  In order to check for the null CHAR, the algorithm
   keeps track of the matched bytes/dwords, requiring 5 EVEX instructions
   (3 VPCMP and 2 KORD).  In general, the cost of comparing VEC_SIZE
   bytes (32 bytes) is 3 VPCMP and 2 KORD instructions, together with
   VMOVU and ktestd instructions.  The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) per iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
   same as strcmp, except that a maximum offset is also tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
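/* For reference, the scalar behaviour that the vector code below must
   reproduce (a minimal sketch; the real routine additionally handles
   the strncmp length bound, wide characters and case folding):

	int
	scalar_strcmp (const unsigned char *s1, const unsigned char *s2)
	{
	  while (*s1 != 0 && *s1 == *s2)
	    s1++, s2++;
	  return *s1 - *s2;
	}

   i.e. find the first position where the strings differ or s1 ends and
   return the difference at that position (unsigned elements for
   strcmp/strncmp, signed for wcscmp/wcsncmp, per the warning above).  */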
210
211 .section SECTION(.text), "ax", @progbits
212 .align 16
213 .type STRCMP, @function
214 .globl STRCMP
215# ifdef USE_AS_STRCASECMP_L
216ENTRY (STRCASECMP)
217 movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
218 mov %fs:(%rax), %LOCALE_REG_LP
219
	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
221 .p2align 4
222END (STRCASECMP)
223 /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
224# endif
225
226 .p2align 4
227STRCMP:
228 cfi_startproc
229 _CET_ENDBR
230 CALL_MCOUNT
231
232# if defined USE_AS_STRCASECMP_L
233 /* We have to fall back on the C implementation for locales with
234 encodings not matching ASCII for single bytes. */
235# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
236 mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
237# else
238 mov (%LOCALE_REG), %RAX_LP
239# endif
240 testb $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
241 jne STRCASECMP_L_NONASCII
242 leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
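	/* Note: TOLOWER_BASE points 128 entries past the start of the
	   table so that TOLOWER_gpr's `movl (TOLOWER_BASE, src, 4), dst`
	   indexes the 32-bit per-character tolower table; the 128-entry
	   bias matches the layout of the ctype tables, which are defined
	   for indices -128..255 so negative values (e.g. EOF) are also
	   valid.  */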
243# endif
244
245# ifdef USE_AS_STRNCMP
	/* Don't overwrite LOCALE_REG (rcx) until we have passed
	   L(one_or_less).  Otherwise we might use the wrong locale in
	   the OVERFLOW_STRCMP (strcasecmp_l).  */
249# ifdef __ILP32__
250 /* Clear the upper 32 bits. */
251 movl %edx, %edx
252# endif
253 cmp $1, %RDX_LP
254 /* Signed comparison intentional. We use this branch to also
255 test cases where length >= 2^63. These very large sizes can be
256 handled with strcmp as there is no way for that length to
257 actually bound the buffer. */
258 jle L(one_or_less)
259# endif
260
261# if defined USE_AS_STRCASECMP_L
262 .section RODATA_SECTION, "aM", @progbits, VEC_SIZE
263 .align VEC_SIZE
264L(lcase_min):
265 .quad 0x4141414141414141
266 .quad 0x4141414141414141
267 .quad 0x4141414141414141
268 .quad 0x4141414141414141
269# if VEC_SIZE == 64
270 .quad 0x4141414141414141
271 .quad 0x4141414141414141
272 .quad 0x4141414141414141
273 .quad 0x4141414141414141
274# endif
275L(lcase_max):
276 .quad 0x1a1a1a1a1a1a1a1a
277 .quad 0x1a1a1a1a1a1a1a1a
278 .quad 0x1a1a1a1a1a1a1a1a
279 .quad 0x1a1a1a1a1a1a1a1a
280# if VEC_SIZE == 64
281 .quad 0x1a1a1a1a1a1a1a1a
282 .quad 0x1a1a1a1a1a1a1a1a
283 .quad 0x1a1a1a1a1a1a1a1a
284 .quad 0x1a1a1a1a1a1a1a1a
285# endif
286L(case_add):
287 .quad 0x2020202020202020
288 .quad 0x2020202020202020
289 .quad 0x2020202020202020
290 .quad 0x2020202020202020
291# if VEC_SIZE == 64
292 .quad 0x2020202020202020
293 .quad 0x2020202020202020
294 .quad 0x2020202020202020
295 .quad 0x2020202020202020
296# endif
297 .previous
298
299 VMOVA L(lcase_min)(%rip), %LCASE_MIN_V
300 VMOVA L(lcase_max)(%rip), %LCASE_MAX_V
301 VMOVA L(case_add)(%rip), %CASE_ADD_V
302# endif
303
304 movl %edi, %eax
305 orl %esi, %eax
	/* Shift out the bits irrelevant to the page boundary ([63:12]).  */
307 sall $20, %eax
308 /* Check if s1 or s2 may cross a page in next 4x VEC loads. */
309 cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
310 ja L(page_cross)
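	/* The check above is equivalent to the scalar test (sketch):

		if ((((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1))
		    > PAGE_SIZE - (VEC_SIZE * 4))
		  goto page_cross;

	   sall $20 moves the 12 page-offset bits into the top of eax so
	   a single 32-bit compare can be used.  Because the two page
	   offsets are OR-ed together rather than taking the maximum, the
	   test can give false positives; L(page_cross) re-checks before
	   taking the slow path.  */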
311
312L(no_page_cross):
313 /* Safe to compare 4x vectors. */
314 VMOVU (%rdi), %VMM(0)
315 VPTESTM %VMM(0), %VMM(0), %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   between YMM0 and the VEC_SIZE bytes at (%rsi).  */
318 CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
319 KMOV %k1, %VRCX
320# ifdef USE_AS_STRNCMP
321 cmpq $CHAR_PER_VEC, %rdx
322 jbe L(vec_0_test_len)
323# endif
324
	/* TESTEQ is `incl` for strcmp/strncmp and
	   `subl $((1 << CHAR_PER_VEC) - 1)` for wcscmp/wcsncmp.  */

	/* An all-ones mask means all characters compared equal.  TESTEQ
	   wraps the mask to zero in the all-equal case.  Otherwise the
	   carry propagates only up to the position of the first
	   mismatch.  */
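	/* Worked example with CHAR_PER_VEC == 32 (strcmp, VEC_SIZE == 32):
	   if all 32 characters are equal and non-null, %VRCX is
	   0xffffffff and `incl` wraps it to zero, so the branch below is
	   taken.  If the first zero bit is at position j (first mismatch
	   or null), the carry from `incl` clears bits 0..j-1 and sets
	   bit j, so the bsf in L(return_vec_0) recovers j.  The `sub`
	   form used for wcscmp behaves the same way on its CHAR_PER_VEC
	   significant bits.  */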
331 TESTEQ %VRCX
332 jz L(more_3x_vec)
333
334 .p2align 4,, 4
335L(return_vec_0):
336 bsf %VRCX, %VRCX
337# ifdef USE_AS_WCSCMP
338 movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
339 xorl %eax, %eax
340 cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
341 je L(ret0)
342 setl %al
343 negl %eax
344 orl $1, %eax
345# else
346 movzbl (%rdi, %rcx), %eax
347 /* For VEC_SIZE == 64 use movb instead of movzbl to save a byte
348 and keep logic for len <= VEC_SIZE (common) in just the
349 first cache line. NB: No evex512 processor has partial-
350 register stalls. If that changes this ifdef can be disabled
351 without affecting correctness. */
352# if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L && VEC_SIZE == 64
353 movb (%rsi, %rcx), %cl
354# else
355 movzbl (%rsi, %rcx), %ecx
356# endif
357 TOLOWER_gpr (%rax, %eax)
358 TOLOWER_gpr (%rcx, %ecx)
359 subl %ecx, %eax
360# endif
361L(ret0):
362 ret
363
364# ifdef USE_AS_STRNCMP
365 .p2align 4,, 4
366L(vec_0_test_len):
367 not %VRCX
368 bzhi %VRDX, %VRCX, %VRAX
369 jnz L(return_vec_0)
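	/* `not` turns the match mask into a mismatch/null mask and
	   `bzhi` clears all bits at positions >= the length in %VRDX, so
	   %VRAX is non-zero only if a difference or null occurs within
	   the first rdx characters; otherwise the length limit was hit
	   first and zero is returned below.  */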
	/* Align if it will cross a fetch block.  */
371 .p2align 4,, 2
372L(ret_zero):
373 xorl %eax, %eax
374 ret
375
376 .p2align 4,, 5
377L(one_or_less):
378# ifdef USE_AS_STRCASECMP_L
379 /* Set locale argument for strcasecmp. */
380 movq %LOCALE_REG, %rdx
381# endif
382 jb L(ret_zero)
383 /* 'nbe' covers the case where length is negative (large
384 unsigned). */
385 jnbe OVERFLOW_STRCMP
386# ifdef USE_AS_WCSCMP
387 movl (%rdi), %edx
388 xorl %eax, %eax
389 cmpl (%rsi), %edx
390 je L(ret1)
391 setl %al
392 negl %eax
393 orl $1, %eax
394# else
395 movzbl (%rdi), %eax
396 movzbl (%rsi), %ecx
397 TOLOWER_gpr (%rax, %eax)
398 TOLOWER_gpr (%rcx, %ecx)
399 subl %ecx, %eax
400# endif
401L(ret1):
402 ret
403# endif
404
405 .p2align 4,, 10
406L(return_vec_1):
407 bsf %VRCX, %VRCX
408# ifdef USE_AS_STRNCMP
	/* rdx must be > CHAR_PER_VEC so it's safe to subtract without
	   worrying about underflow.  */
411 addq $-CHAR_PER_VEC, %rdx
412 cmpq %rcx, %rdx
413 jbe L(ret_zero)
414# endif
415# ifdef USE_AS_WCSCMP
416 movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
417 xorl %eax, %eax
418 cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
419 je L(ret2)
420 setl %al
421 negl %eax
422 orl $1, %eax
423# else
424 movzbl VEC_SIZE(%rdi, %rcx), %eax
425 movzbl VEC_SIZE(%rsi, %rcx), %ecx
426 TOLOWER_gpr (%rax, %eax)
427 TOLOWER_gpr (%rcx, %ecx)
428 subl %ecx, %eax
429# endif
430L(ret2):
431 ret
432
433 .p2align 4,, 10
434# ifdef USE_AS_STRNCMP
435L(return_vec_3):
436# if CHAR_PER_VEC <= 32
	/* If CHAR_PER_VEC <= 32 reuse the code at L(return_vec_2) without
	   additional branches by adjusting the bit positions from
	   VEC3.  We can't do this for CHAR_PER_VEC == 64.  */
440# if CHAR_PER_VEC <= 16
441 sall $CHAR_PER_VEC, %ecx
442# else
443 salq $CHAR_PER_VEC, %rcx
444# endif
445# else
446 /* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
447 check it. */
448 bsf %VRCX, %VRCX
449 addl $(CHAR_PER_VEC), %ecx
450 cmpq %rcx, %rdx
451 ja L(ret_vec_3_finish)
452 xorl %eax, %eax
453 ret
454# endif
455# endif
456
457 /* If CHAR_PER_VEC == 64 we can't combine matches from the last
458 2x VEC so need separate return label. */
459L(return_vec_2):
460# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
461 bsf %VRCX, %VRCX
462# else
463 bsfq %rcx, %rcx
464# endif
465# ifdef USE_AS_STRNCMP
466 cmpq %rcx, %rdx
467 jbe L(ret_zero)
468# endif
469
470L(ret_vec_3_finish):
471# ifdef USE_AS_WCSCMP
472 movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
473 xorl %eax, %eax
474 cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
475 je L(ret3)
476 setl %al
477 negl %eax
478 orl $1, %eax
479# else
480 movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
481 movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
482 TOLOWER_gpr (%rax, %eax)
483 TOLOWER_gpr (%rcx, %ecx)
484 subl %ecx, %eax
485# endif
486L(ret3):
487 ret
488
489# ifndef USE_AS_STRNCMP
490 .p2align 4,, 10
491L(return_vec_3):
492 bsf %VRCX, %VRCX
493# ifdef USE_AS_WCSCMP
494 movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
495 xorl %eax, %eax
496 cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
497 je L(ret4)
498 setl %al
499 negl %eax
500 orl $1, %eax
501# else
502 movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
503 movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
504 TOLOWER_gpr (%rax, %eax)
505 TOLOWER_gpr (%rcx, %ecx)
506 subl %ecx, %eax
507# endif
508L(ret4):
509 ret
510# endif
511
512 /* 32 byte align here ensures the main loop is ideally aligned
513 for DSB. */
514 .p2align 5
515L(more_3x_vec):
516 /* Safe to compare 4x vectors. */
517 VMOVU (VEC_SIZE)(%rdi), %VMM(0)
518 VPTESTM %VMM(0), %VMM(0), %k2
519 CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
520 KMOV %k1, %VRCX
521 TESTEQ %VRCX
522 jnz L(return_vec_1)
523
524# ifdef USE_AS_STRNCMP
525 subq $(CHAR_PER_VEC * 2), %rdx
526 jbe L(ret_zero)
527# endif
528
529 VMOVU (VEC_SIZE * 2)(%rdi), %VMM(0)
530 VPTESTM %VMM(0), %VMM(0), %k2
531 CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 2)(%rsi), %VMM(1), %k1){%k2}
532 KMOV %k1, %VRCX
533 TESTEQ %VRCX
534 jnz L(return_vec_2)
535
536 VMOVU (VEC_SIZE * 3)(%rdi), %VMM(0)
537 VPTESTM %VMM(0), %VMM(0), %k2
538 CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 3)(%rsi), %VMM(1), %k1){%k2}
539 KMOV %k1, %VRCX
540 TESTEQ %VRCX
541 jnz L(return_vec_3)
542
543# ifdef USE_AS_STRNCMP
544 cmpq $(CHAR_PER_VEC * 2), %rdx
545 jbe L(ret_zero)
546# endif
547
548
549# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	 */
552 movl $2, %r8d
553
554# else
555 xorl %r8d, %r8d
556# endif
557
558 /* The prepare labels are various entry points from the page
559 cross logic. */
560L(prepare_loop):
561
562# ifdef USE_AS_STRNCMP
563# ifdef USE_AS_WCSCMP
564L(prepare_loop_no_len):
565 movl %edi, %ecx
566 andl $(VEC_SIZE * 4 - 1), %ecx
567 shrl $2, %ecx
568 leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
569# else
570 /* Store N + (VEC_SIZE * 4) and place check at the beginning of
571 the loop. */
572 leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
573L(prepare_loop_no_len):
574# endif
575# else
576L(prepare_loop_no_len):
577# endif
578
579 /* Align s1 and adjust s2 accordingly. */
580 subq %rdi, %rsi
581 andq $-(VEC_SIZE * 4), %rdi
582L(prepare_loop_readj):
583 addq %rdi, %rsi
584# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
585 subq %rdi, %rdx
586# endif
587
588L(prepare_loop_aligned):
589 /* eax stores distance from rsi to next page cross. These cases
590 need to be handled specially as the 4x loop could potentially
591 read memory past the length of s1 or s2 and across a page
592 boundary. */
593 movl $-(VEC_SIZE * 4), %eax
594 subl %esi, %eax
595 andl $(PAGE_SIZE - 1), %eax
596
597
598 /* Loop 4x comparisons at a time. */
599 .p2align 4
600L(loop):
601
602 /* End condition for strncmp. */
603# ifdef USE_AS_STRNCMP
604 subq $(CHAR_PER_VEC * 4), %rdx
605 jbe L(ret_zero)
606# endif
607
608 subq $-(VEC_SIZE * 4), %rdi
609 subq $-(VEC_SIZE * 4), %rsi
610
611 /* Check if rsi loads will cross a page boundary. */
612 addl $-(VEC_SIZE * 4), %eax
613 jnb L(page_cross_during_loop)
614
615 /* Loop entry after handling page cross during loop. */
616L(loop_skip_page_cross_check):
617 VMOVA (VEC_SIZE * 0)(%rdi), %VMM(0)
618 VMOVA (VEC_SIZE * 1)(%rdi), %VMM(2)
619 VMOVA (VEC_SIZE * 2)(%rdi), %VMM(4)
620 VMOVA (VEC_SIZE * 3)(%rdi), %VMM(6)
621
622 VPMINU %VMM(0), %VMM(2), %VMM(8)
623 VPMINU %VMM(4), %VMM(6), %VMM(9)
624
625 /* A zero CHAR in YMM9 means that there is a null CHAR. */
626 VPMINU %VMM(8), %VMM(9), %VMM(9)
627
628 /* Each bit set in K1 represents a non-null CHAR in YMM9. */
629 VPTESTM %VMM(9), %VMM(9), %k1
630# ifndef USE_AS_STRCASECMP_L
631 vpxorq (VEC_SIZE * 0)(%rsi), %VMM(0), %VMM(1)
632 vpxorq (VEC_SIZE * 1)(%rsi), %VMM(2), %VMM(3)
633 vpxorq (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
634 /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
635 oring with YMM1. Result is stored in YMM6. */
636 vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(1), %VMM(6)
637# else
638 VMOVU (VEC_SIZE * 0)(%rsi), %VMM(1)
639 TOLOWER_VMM (%VMM(0), %VMM(1))
640 VMOVU (VEC_SIZE * 1)(%rsi), %VMM(3)
641 TOLOWER_VMM (%VMM(2), %VMM(3))
642 VMOVU (VEC_SIZE * 2)(%rsi), %VMM(5)
643 TOLOWER_VMM (%VMM(4), %VMM(5))
644 VMOVU (VEC_SIZE * 3)(%rsi), %VMM(7)
645 TOLOWER_VMM (%VMM(6), %VMM(7))
646 vpxorq %VMM(0), %VMM(1), %VMM(1)
647 vpxorq %VMM(2), %VMM(3), %VMM(3)
648 vpxorq %VMM(4), %VMM(5), %VMM(5)
649 vpternlogd $0xde, %VMM(7), %VMM(1), %VMM(6)
650# endif
651 /* Or together YMM3, YMM5, and YMM6. */
652 vpternlogd $0xfe, %VMM(3), %VMM(5), %VMM(6)
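	/* vpternlogd evaluates an arbitrary three-input boolean function
	   chosen by the immediate: for inputs A (destination), B (first
	   source) and C (second source / memory operand), bit
	   (A*4 + B*2 + C) of the immediate is the result.  0xde encodes
	   B | (A ^ C) and 0xfe encodes A | B | C, which is how the four
	   per-vector difference masks are folded into YMM6 with just
	   these two instructions.  */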
653
654
655 /* A non-zero CHAR in YMM6 represents a mismatch. */
656 VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
657 KMOV %k0, %LOOP_REG
658
659 TESTEQ %LOOP_REG
660 jz L(loop)
661
662
	/* Find which VEC has the mismatch or end of string.  */
664 VPTESTM %VMM(0), %VMM(0), %k1
665 VPTESTNM %VMM(1), %VMM(1), %k0{%k1}
666 KMOV %k0, %VRCX
667 TESTEQ %VRCX
668 jnz L(return_vec_0_end)
669
670 VPTESTM %VMM(2), %VMM(2), %k1
671 VPTESTNM %VMM(3), %VMM(3), %k0{%k1}
672 KMOV %k0, %VRCX
673 TESTEQ %VRCX
674 jnz L(return_vec_1_end)
675
676
677 /* Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32.
678 */
679L(return_vec_2_3_end):
680# ifdef USE_AS_STRNCMP
681 subq $(CHAR_PER_VEC * 2), %rdx
682 jbe L(ret_zero_end)
683# endif
684
685 VPTESTM %VMM(4), %VMM(4), %k1
686 VPTESTNM %VMM(5), %VMM(5), %k0{%k1}
687 KMOV %k0, %VRCX
688 TESTEQ %VRCX
689# if CHAR_PER_VEC <= 16
690 sall $CHAR_PER_VEC, %LOOP_REG
691 orl %ecx, %LOOP_REG
692# elif CHAR_PER_VEC <= 32
693 salq $CHAR_PER_VEC, %LOOP_REG64
694 orq %rcx, %LOOP_REG64
695# else
	/* We aren't combining the last 2x VEC so branch on the second to
	   last.  */
698 jnz L(return_vec_2_end)
699# endif
700
	/* LOOP_REG contains matches for null/mismatch from the loop.  If
	   VEC 0, 1, and 2 all have no null and no mismatches then the
	   mismatch must entirely be from VEC 3 which is fully
	   represented by LOOP_REG.  */
705# if CHAR_PER_VEC <= 16
706 bsf %LOOP_REG, %LOOP_REG
707# else
708 bsfq %LOOP_REG64, %LOOP_REG64
709# endif
710# ifdef USE_AS_STRNCMP
711
	/* If CHAR_PER_VEC == 64 we can't combine the last 2x VEC so we
	   need to adjust the length before the last comparison.  */
714# if CHAR_PER_VEC == 64
715 subq $CHAR_PER_VEC, %rdx
716 jbe L(ret_zero_end)
717# endif
718
719 cmpq %LOOP_REG64, %rdx
720 jbe L(ret_zero_end)
721# endif
722
723# ifdef USE_AS_WCSCMP
724 movl (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
725 xorl %eax, %eax
726 cmpl (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
727 je L(ret5)
728 setl %al
729 negl %eax
730 xorl %r8d, %eax
731# else
732 movzbl (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax
733 movzbl (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx
734 TOLOWER_gpr (%rax, %eax)
735 TOLOWER_gpr (%rcx, %ecx)
736 subl %ecx, %eax
737 xorl %r8d, %eax
738 subl %r8d, %eax
739# endif
740L(ret5):
741 ret
742
743# ifdef USE_AS_STRNCMP
744 .p2align 4,, 2
745L(ret_zero_end):
746 xorl %eax, %eax
747 ret
748# endif
749
750
751
	/* The L(return_vec_N_end) labels differ from L(return_vec_N) in
	   that they use the value of `r8` to negate the return value.
	   This is because the page cross logic can swap `rdi` and `rsi`.
	 */
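	/* When r8d == 0 the xor/sub pair used below is a no-op.  When
	   the strings were swapped, r8d == -1 (-4 for wcscmp).  In the
	   byte paths `xorl %r8d, %eax; subl %r8d, %eax` then computes
	   (eax ^ -1) - (-1) == -eax, negating the return value without a
	   branch; in the wcscmp paths the xor alone flips the sign of
	   the 0 / -1 value produced by setl/negl.  */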
756 .p2align 4,, 10
757# ifdef USE_AS_STRNCMP
758L(return_vec_1_end):
759# if CHAR_PER_VEC <= 32
760 /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end)
761 without additional branches by adjusting the bit positions
762 from VEC1. We can't do this for CHAR_PER_VEC == 64. */
763# if CHAR_PER_VEC <= 16
764 sall $CHAR_PER_VEC, %ecx
765# else
766 salq $CHAR_PER_VEC, %rcx
767# endif
768# else
769 /* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
770 check it. */
771 bsf %VRCX, %VRCX
772 addl $(CHAR_PER_VEC), %ecx
773 cmpq %rcx, %rdx
774 ja L(ret_vec_0_end_finish)
775 xorl %eax, %eax
776 ret
777# endif
778# endif
779L(return_vec_0_end):
780# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
781 bsf %VRCX, %VRCX
782# else
783 bsfq %rcx, %rcx
784# endif
785
786# ifdef USE_AS_STRNCMP
787 cmpq %rcx, %rdx
788 jbe L(ret_zero_end)
789# endif
790
791L(ret_vec_0_end_finish):
792# ifdef USE_AS_WCSCMP
793 movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
794 xorl %eax, %eax
795 cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
796 je L(ret6)
797 setl %al
798 negl %eax
	/* This is the non-zero case for `eax` so just xorl with `r8d` to
	   flip the sign if `rdi` and `rsi` were swapped.  */
801 xorl %r8d, %eax
802# else
803 movzbl (%rdi, %rcx), %eax
804 movzbl (%rsi, %rcx), %ecx
805 TOLOWER_gpr (%rax, %eax)
806 TOLOWER_gpr (%rcx, %ecx)
807 subl %ecx, %eax
	/* Flip `eax` if `rdi` and `rsi` were swapped in the page cross
	   logic.  Subtract `r8d` after the xor to handle the zero case.  */
810 xorl %r8d, %eax
811 subl %r8d, %eax
812# endif
813L(ret6):
814 ret
815
816# ifndef USE_AS_STRNCMP
817 .p2align 4,, 10
818L(return_vec_1_end):
819 bsf %VRCX, %VRCX
820# ifdef USE_AS_WCSCMP
821 movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
822 xorl %eax, %eax
823 cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
824 je L(ret7)
825 setl %al
826 negl %eax
827 xorl %r8d, %eax
828# else
829 movzbl VEC_SIZE(%rdi, %rcx), %eax
830 movzbl VEC_SIZE(%rsi, %rcx), %ecx
831 TOLOWER_gpr (%rax, %eax)
832 TOLOWER_gpr (%rcx, %ecx)
833 subl %ecx, %eax
834 xorl %r8d, %eax
835 subl %r8d, %eax
836# endif
837L(ret7):
838 ret
839# endif
840
841
842 /* If CHAR_PER_VEC == 64 we can't combine matches from the last
843 2x VEC so need separate return label. */
844# if CHAR_PER_VEC == 64
845L(return_vec_2_end):
846 bsf %VRCX, %VRCX
847# ifdef USE_AS_STRNCMP
848 cmpq %rcx, %rdx
849 jbe L(ret_zero_end)
850# endif
851# ifdef USE_AS_WCSCMP
852 movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
853 xorl %eax, %eax
854 cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
	je	L(ret13)
856 setl %al
857 negl %eax
	/* This is the non-zero case for `eax` so just xorl with `r8d` to
	   flip the sign if `rdi` and `rsi` were swapped.  */
860 xorl %r8d, %eax
861# else
862 movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
863 movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
864 TOLOWER_gpr (%rax, %eax)
865 TOLOWER_gpr (%rcx, %ecx)
866 subl %ecx, %eax
	/* Flip `eax` if `rdi` and `rsi` were swapped in the page cross
	   logic.  Subtract `r8d` after the xor to handle the zero case.  */
869 xorl %r8d, %eax
870 subl %r8d, %eax
871# endif
872L(ret13):
873 ret
874# endif
875
876
877 /* Page cross in rsi in next 4x VEC. */
878
879 /* TODO: Improve logic here. */
880 .p2align 4,, 10
881L(page_cross_during_loop):
882 /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */
883
	/* Optimistically rsi and rdi are both aligned, in which case we
	   don't need any special logic here.  */
	cmpl	$-(VEC_SIZE * 4), %eax
	/* Don't adjust eax before jumping back to the loop, so we will
	   never hit the page cross case again.  */
889 je L(loop_skip_page_cross_check)
890
891 /* Check if we can safely load a VEC. */
892 cmpl $-(VEC_SIZE * 3), %eax
893 jle L(less_1x_vec_till_page_cross)
894
895 VMOVA (%rdi), %VMM(0)
896 VPTESTM %VMM(0), %VMM(0), %k2
897 CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
898 KMOV %k1, %VRCX
899 TESTEQ %VRCX
900 jnz L(return_vec_0_end)
901
902 /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */
903 cmpl $-(VEC_SIZE * 2), %eax
904 jg L(more_2x_vec_till_page_cross)
905
906 .p2align 4,, 4
907L(less_1x_vec_till_page_cross):
908 subl $-(VEC_SIZE * 4), %eax
	/* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
	   concerning case is the first iteration, if the incoming s1 was
	   near the start of a page and s2 near the end.  If s1 was near
	   the start of the page we already aligned up to the nearest
	   VEC_SIZE * 4 so it is guaranteed safe to read back -VEC_SIZE.
	   If rdi is truly at the start of a page here, it means the
	   previous page (rdi - VEC_SIZE) has already been loaded earlier
	   so it must be valid.  */
916 VMOVU -VEC_SIZE(%rdi, %rax), %VMM(0)
917 VPTESTM %VMM(0), %VMM(0), %k2
918 CMP_R1_S2_VMM (%VMM(0), -VEC_SIZE(%rsi, %rax), %VMM(1), %k1){%k2}
919 /* Mask of potentially valid bits. The lower bits can be out of
920 range comparisons (but safe regarding page crosses). */
921
922# ifdef USE_AS_WCSCMP
923 movl $-1, %r10d
924 movl %esi, %ecx
925 andl $(VEC_SIZE - 1), %ecx
926 shrl $2, %ecx
927 shlxl %ecx, %r10d, %ecx
928 /* Depending on CHAR_PER_VEC extract mask for possible in-bound
929 matches. */
930# if CHAR_PER_VEC == 16
931 movzwl %cx, %r10d
932# elif CHAR_PER_VEC == 8
933 movzbl %cl, %r10d
934# else
935# error "Invalid CHAR_SIZE or VEC_SIZE"
936# endif
937# else
938 mov $-1, %VRCX
939 shlx %VRSI, %VRCX, %VR10
940# endif
941
942 KMOV %k1, %VRCX
943 not %VRCX
944
945
946# ifdef USE_AS_STRNCMP
947# ifdef USE_AS_WCSCMP
948 /* NB: strcasecmp not used with WCSCMP so this access to r11 is
949 safe. */
950 movl %eax, %r11d
951 shrl $2, %r11d
952 cmpq %r11, %rdx
953# else
954 cmpq %rax, %rdx
955# endif
956 jbe L(return_page_cross_end_check)
957# endif
958 movl %eax, %OFFSET_REG
959
960 /* Readjust eax before potentially returning to the loop. */
961 addl $(PAGE_SIZE - VEC_SIZE * 4), %eax
962
963 and %VR10, %VRCX
964 jz L(loop_skip_page_cross_check)
965
966 bsf %VRCX, %VRCX
967
968# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
969 leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
970L(return_page_cross_cmp_mem):
971# else
972 addl %OFFSET_REG, %ecx
973# endif
974# ifdef USE_AS_WCSCMP
975 movl VEC_OFFSET(%rdi, %rcx), %edx
976 xorl %eax, %eax
977 cmpl VEC_OFFSET(%rsi, %rcx), %edx
978 je L(ret8)
979 setl %al
980 negl %eax
981 xorl %r8d, %eax
982# else
983 movzbl VEC_OFFSET(%rdi, %rcx), %eax
984 movzbl VEC_OFFSET(%rsi, %rcx), %ecx
985 TOLOWER_gpr (%rax, %eax)
986 TOLOWER_gpr (%rcx, %ecx)
987 subl %ecx, %eax
988 xorl %r8d, %eax
989 subl %r8d, %eax
990# endif
991L(ret8):
992 ret
993
994# ifdef USE_AS_STRNCMP
995 .p2align 4,, 10
996L(return_page_cross_end_check):
997 and %VR10, %VRCX
	/* Need to use tzcnt here as VRCX may be zero.  If VRCX is zero
	   tzcnt(VRCX) will be CHAR_PER_VEC and the remaining length
	   (edx) is guaranteed to be <= CHAR_PER_VEC so we will only use
	   the return idx if VRCX was non-zero.  */
1002 tzcnt %VRCX, %VRCX
1003 leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
1004# ifdef USE_AS_WCSCMP
1005 sall $2, %edx
1006# endif
1007 cmpl %ecx, %edx
1008 ja L(return_page_cross_cmp_mem)
1009 xorl %eax, %eax
1010 ret
1011# endif
1012
1013
1014 .p2align 4,, 10
1015L(more_2x_vec_till_page_cross):
	/* If there are more than 2x VEC until the page cross we will
	   complete a full loop iteration here.  */
1018
1019 VMOVA VEC_SIZE(%rdi), %VMM(0)
1020 VPTESTM %VMM(0), %VMM(0), %k2
1021 CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
1022 KMOV %k1, %VRCX
1023 TESTEQ %VRCX
1024 jnz L(return_vec_1_end)
1025
1026# ifdef USE_AS_STRNCMP
1027 cmpq $(CHAR_PER_VEC * 2), %rdx
1028 jbe L(ret_zero_in_loop_page_cross)
1029# endif
1030
1031 subl $-(VEC_SIZE * 4), %eax
1032
1033 /* Safe to include comparisons from lower bytes. */
1034 VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %VMM(0)
1035 VPTESTM %VMM(0), %VMM(0), %k2
1036 CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 2)(%rsi, %rax), %VMM(1), %k1){%k2}
1037 KMOV %k1, %VRCX
1038 TESTEQ %VRCX
1039 jnz L(return_vec_page_cross_0)
1040
1041 VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %VMM(0)
1042 VPTESTM %VMM(0), %VMM(0), %k2
1043 CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 1)(%rsi, %rax), %VMM(1), %k1){%k2}
1044 KMOV %k1, %VRCX
1045 TESTEQ %VRCX
1046 jnz L(return_vec_page_cross_1)
1047
1048# ifdef USE_AS_STRNCMP
	/* Must check the length here as it might preclude reading the
	   next page.  */
1051# ifdef USE_AS_WCSCMP
1052 /* NB: strcasecmp not used with WCSCMP so this access to r11 is
1053 safe. */
1054 movl %eax, %r11d
1055 shrl $2, %r11d
1056 cmpq %r11, %rdx
1057# else
1058 cmpq %rax, %rdx
1059# endif
1060 jbe L(ret_zero_in_loop_page_cross)
1061# endif
1062
1063 /* Finish the loop. */
1064 VMOVA (VEC_SIZE * 2)(%rdi), %VMM(4)
1065 VMOVA (VEC_SIZE * 3)(%rdi), %VMM(6)
1066 VPMINU %VMM(4), %VMM(6), %VMM(9)
1067 VPTESTM %VMM(9), %VMM(9), %k1
1068# ifndef USE_AS_STRCASECMP_L
1069 vpxorq (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
1070 /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
1071 vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(5), %VMM(6)
1072# else
1073 VMOVU (VEC_SIZE * 2)(%rsi), %VMM(5)
1074 TOLOWER_VMM (%VMM(4), %VMM(5))
1075 VMOVU (VEC_SIZE * 3)(%rsi), %VMM(7)
1076 TOLOWER_VMM (%VMM(6), %VMM(7))
1077 vpxorq %VMM(4), %VMM(5), %VMM(5)
1078 vpternlogd $0xde, %VMM(7), %VMM(5), %VMM(6)
1079# endif
1080 VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
1081 KMOV %k0, %LOOP_REG
1082 TESTEQ %LOOP_REG
1083 jnz L(return_vec_2_3_end)
1084
	/* Best for code size to use an unconditional jmp here.  If this
	   case is hot it would be faster to duplicate the
	   L(return_vec_2_3_end) code as the fall-through and jump back
	   to the loop on the mismatch comparison.  */
1089 subq $-(VEC_SIZE * 4), %rdi
1090 subq $-(VEC_SIZE * 4), %rsi
1091 addl $(PAGE_SIZE - VEC_SIZE * 8), %eax
1092# ifdef USE_AS_STRNCMP
1093 subq $(CHAR_PER_VEC * 4), %rdx
1094 ja L(loop_skip_page_cross_check)
1095L(ret_zero_in_loop_page_cross):
1096 xorl %eax, %eax
1097 ret
1098# else
1099 jmp L(loop_skip_page_cross_check)
1100# endif
1101
1102
1103 .p2align 4,, 10
1104L(return_vec_page_cross_0):
1105 addl $-VEC_SIZE, %eax
1106L(return_vec_page_cross_1):
1107 bsf %VRCX, %VRCX
1108# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
1109 leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
1110# ifdef USE_AS_STRNCMP
1111# ifdef USE_AS_WCSCMP
	/* Must divide ecx instead of multiplying rdx due to overflow.  */
1113 movl %ecx, %eax
1114 shrl $2, %eax
1115 cmpq %rax, %rdx
1116# else
1117 cmpq %rcx, %rdx
1118# endif
1119 jbe L(ret_zero_in_loop_page_cross)
1120# endif
1121# else
1122 addl %eax, %ecx
1123# endif
1124
1125# ifdef USE_AS_WCSCMP
1126 movl VEC_OFFSET(%rdi, %rcx), %edx
1127 xorl %eax, %eax
1128 cmpl VEC_OFFSET(%rsi, %rcx), %edx
1129 je L(ret9)
1130 setl %al
1131 negl %eax
1132 xorl %r8d, %eax
1133# else
1134 movzbl VEC_OFFSET(%rdi, %rcx), %eax
1135 movzbl VEC_OFFSET(%rsi, %rcx), %ecx
1136 TOLOWER_gpr (%rax, %eax)
1137 TOLOWER_gpr (%rcx, %ecx)
1138 subl %ecx, %eax
1139 xorl %r8d, %eax
1140 subl %r8d, %eax
1141# endif
1142L(ret9):
1143 ret
1144
1145
1146 .p2align 4,, 10
1147L(page_cross):
1148# ifndef USE_AS_STRNCMP
1149 /* If both are VEC aligned we don't need any special logic here.
1150 Only valid for strcmp where stop condition is guaranteed to
1151 be reachable by just reading memory. */
1152 testl $((VEC_SIZE - 1) << 20), %eax
1153 jz L(no_page_cross)
1154# endif
1155
1156 movl %edi, %eax
1157 movl %esi, %ecx
1158 andl $(PAGE_SIZE - 1), %eax
1159 andl $(PAGE_SIZE - 1), %ecx
1160
1161 xorl %OFFSET_REG, %OFFSET_REG
1162
1163 /* Check which is closer to page cross, s1 or s2. */
1164 cmpl %eax, %ecx
1165 jg L(page_cross_s2)
1166
1167 /* The previous page cross check has false positives. Check for
1168 true positive as page cross logic is very expensive. */
1169 subl $(PAGE_SIZE - VEC_SIZE * 4), %eax
1170 jbe L(no_page_cross)
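	/* At this point eax holds s1's page offset minus
	   (PAGE_SIZE - VEC_SIZE * 4); it is positive only if s1 really
	   is within 4x VEC of the end of its page, so the conservative
	   OR-based check done at entry is confirmed here before paying
	   for the slow path.  */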
1171
1172
1173 /* Set r8 to not interfere with normal return value (rdi and rsi
1174 did not swap). */
1175# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	 */
1178 movl $2, %r8d
1179# else
1180 xorl %r8d, %r8d
1181# endif
1182
1183 /* Check if less than 1x VEC till page cross. */
1184 subl $(VEC_SIZE * 3), %eax
1185 jg L(less_1x_vec_till_page)
1186
1187
1188 /* If more than 1x VEC till page cross, loop through safely
1189 loadable memory until within 1x VEC of page cross. */
1190 .p2align 4,, 8
1191L(page_cross_loop):
1192 VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
1193 VPTESTM %VMM(0), %VMM(0), %k2
1194 CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
1195 KMOV %k1, %VRCX
1196 TESTEQ %VRCX
1197 jnz L(check_ret_vec_page_cross)
1198 addl $CHAR_PER_VEC, %OFFSET_REG
1199# ifdef USE_AS_STRNCMP
1200 cmpq %OFFSET_REG64, %rdx
1201 jbe L(ret_zero_page_cross)
1202# endif
1203 addl $VEC_SIZE, %eax
1204 jl L(page_cross_loop)
1205
1206# ifdef USE_AS_WCSCMP
1207 shrl $2, %eax
1208# endif
1209
1210
1211 subl %eax, %OFFSET_REG
1212 /* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
1213 to not cross page so is safe to load. Since we have already
1214 loaded at least 1 VEC from rsi it is also guaranteed to be
1215 safe. */
1216 VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
1217 VPTESTM %VMM(0), %VMM(0), %k2
1218 CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
1219
1220 KMOV %k1, %VRCX
1221# ifdef USE_AS_STRNCMP
1222 leal CHAR_PER_VEC(%OFFSET_REG64), %eax
1223 cmpq %rax, %rdx
1224 jbe L(check_ret_vec_page_cross2)
1225# ifdef USE_AS_WCSCMP
1226 addq $-(CHAR_PER_VEC * 2), %rdx
1227# else
1228 addq %rdi, %rdx
1229# endif
1230# endif
1231 TESTEQ %VRCX
1232 jz L(prepare_loop_no_len)
1233
1234 .p2align 4,, 4
1235L(ret_vec_page_cross):
1236# ifndef USE_AS_STRNCMP
1237L(check_ret_vec_page_cross):
1238# endif
1239 tzcnt %VRCX, %VRCX
1240 addl %OFFSET_REG, %ecx
1241L(ret_vec_page_cross_cont):
1242# ifdef USE_AS_WCSCMP
1243 movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
1244 xorl %eax, %eax
1245 cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
1246 je L(ret12)
1247 setl %al
1248 negl %eax
1249 xorl %r8d, %eax
1250# else
1251 movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
1252 movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
1253 TOLOWER_gpr (%rax, %eax)
1254 TOLOWER_gpr (%rcx, %ecx)
1255 subl %ecx, %eax
1256 xorl %r8d, %eax
1257 subl %r8d, %eax
1258# endif
1259L(ret12):
1260 ret
1261
1262
1263# ifdef USE_AS_STRNCMP
1264 .p2align 4,, 10
1265L(check_ret_vec_page_cross2):
1266 TESTEQ %VRCX
1267L(check_ret_vec_page_cross):
1268 tzcnt %VRCX, %VRCX
1269 addl %OFFSET_REG, %ecx
1270 cmpq %rcx, %rdx
1271 ja L(ret_vec_page_cross_cont)
1272 .p2align 4,, 2
1273L(ret_zero_page_cross):
1274 xorl %eax, %eax
1275 ret
1276# endif
1277
1278 .p2align 4,, 4
1279L(page_cross_s2):
1280 /* Ensure this is a true page cross. */
1281 subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx
1282 jbe L(no_page_cross)
1283
1284
1285 movl %ecx, %eax
1286 movq %rdi, %rcx
1287 movq %rsi, %rdi
1288 movq %rcx, %rsi
1289
	/* Set r8 to negate the return value as rdi and rsi are swapped.  */
1291# ifdef USE_AS_WCSCMP
1292 movl $-4, %r8d
1293# else
1294 movl $-1, %r8d
1295# endif
1296 xorl %OFFSET_REG, %OFFSET_REG
1297
1298 /* Check if more than 1x VEC till page cross. */
1299 subl $(VEC_SIZE * 3), %eax
1300 jle L(page_cross_loop)
1301
1302 .p2align 4,, 6
1303L(less_1x_vec_till_page):
1304# ifdef USE_AS_WCSCMP
1305 shrl $2, %eax
1306# endif
1307
	/* Find the largest load size we can use.  Only for VEC_SIZE == 64
	   do we additionally check whether a full 32 byte (ymm) load is
	   possible.  */
1310# if VEC_SIZE == 64
1311
1312 cmpl $((VEC_SIZE - 32) / SIZE_OF_CHAR), %eax
1313 ja L(less_32_till_page)
1314
1315
	/* Use 32 byte (ymm) comparison.  */
1317 VMOVU (%rdi), %VMM_256(0)
1318 VPTESTM %VMM_256(0), %VMM_256(0), %k2
1319 CMP_R1_S2_YMM (%VMM_256(0), (%rsi), %VMM_256(1), %k1){%k2}
1320 kmovd %k1, %ecx
1321# ifdef USE_AS_WCSCMP
1322 subl $0xff, %ecx
1323# else
1324 incl %ecx
1325# endif
1326 jnz L(check_ret_vec_page_cross)
1327 movl $((VEC_SIZE - 32) / SIZE_OF_CHAR), %OFFSET_REG
1328# ifdef USE_AS_STRNCMP
1329 cmpq %OFFSET_REG64, %rdx
1330 jbe L(ret_zero_page_cross_slow_case64)
1331 subl %eax, %OFFSET_REG
1332# else
1333 /* Explicit check for 32 byte alignment. */
1334 subl %eax, %OFFSET_REG
1335 jz L(prepare_loop)
1336# endif
1337 VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(0)
1338 VPTESTM %VMM_256(0), %VMM_256(0), %k2
1339 CMP_R1_S2_YMM (%VMM_256(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(1), %k1){%k2}
1340 kmovd %k1, %ecx
1341# ifdef USE_AS_WCSCMP
1342 subl $0xff, %ecx
1343# else
1344 incl %ecx
1345# endif
1346 jnz L(check_ret_vec_page_cross)
1347# ifdef USE_AS_STRNCMP
1348 addl $(32 / SIZE_OF_CHAR), %OFFSET_REG
1349 subq %OFFSET_REG64, %rdx
1350 jbe L(ret_zero_page_cross_slow_case64)
1351 subq $-(CHAR_PER_VEC * 4), %rdx
1352
1353 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1354 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1355# else
1356 leaq (32 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1357 leaq (32 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1358# endif
1359 jmp L(prepare_loop_aligned)
1360
1361# ifdef USE_AS_STRNCMP
1362 .p2align 4,, 2
1363L(ret_zero_page_cross_slow_case64):
1364 xorl %eax, %eax
1365 ret
1366# endif
1367L(less_32_till_page):
1368# endif
1369
1370 /* Find largest load size we can use. */
1371 cmpl $((VEC_SIZE - 16) / SIZE_OF_CHAR), %eax
1372 ja L(less_16_till_page)
1373
1374 /* Use 16 byte comparison. */
1375 vmovdqu (%rdi), %xmm0
1376 VPTESTM %xmm0, %xmm0, %k2
1377 CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
1378 kmovd %k1, %ecx
1379# ifdef USE_AS_WCSCMP
1380 subl $0xf, %ecx
1381# else
1382 incw %cx
1383# endif
1384 jnz L(check_ret_vec_page_cross)
1385
1386 movl $((VEC_SIZE - 16) / SIZE_OF_CHAR), %OFFSET_REG
1387# ifdef USE_AS_STRNCMP
1388# if VEC_SIZE == 32
1389 cmpq %OFFSET_REG64, %rdx
1390# else
1391 cmpq $(16 / SIZE_OF_CHAR), %rdx
1392# endif
1393 jbe L(ret_zero_page_cross_slow_case0)
1394 subl %eax, %OFFSET_REG
1395# else
1396 /* Explicit check for 16 byte alignment. */
1397 subl %eax, %OFFSET_REG
1398 jz L(prepare_loop)
1399# endif
1400 vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1401 VPTESTM %xmm0, %xmm0, %k2
1402 CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
1403 kmovd %k1, %ecx
1404# ifdef USE_AS_WCSCMP
1405 subl $0xf, %ecx
1406# else
1407 incw %cx
1408# endif
1409 jnz L(check_ret_vec_page_cross)
1410# ifdef USE_AS_STRNCMP
1411 addl $(16 / SIZE_OF_CHAR), %OFFSET_REG
1412 subq %OFFSET_REG64, %rdx
1413 jbe L(ret_zero_page_cross_slow_case0)
1414 subq $-(CHAR_PER_VEC * 4), %rdx
1415
1416 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1417 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1418# else
1419 leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1420 leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1421# endif
1422 jmp L(prepare_loop_aligned)
1423
1424# ifdef USE_AS_STRNCMP
1425 .p2align 4,, 2
1426L(ret_zero_page_cross_slow_case0):
1427 xorl %eax, %eax
1428 ret
1429# endif
1430
1431
1432 .p2align 4,, 10
1433L(less_16_till_page):
1434 cmpl $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
1435 ja L(less_8_till_page)
1436
1437 /* Use 8 byte comparison. */
1438 vmovq (%rdi), %xmm0
1439 vmovq (%rsi), %xmm1
1440 VPTESTM %xmm0, %xmm0, %k2
1441 CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1442 kmovd %k1, %ecx
1443# ifdef USE_AS_WCSCMP
1444 subl $0x3, %ecx
1445# else
1446 incb %cl
1447# endif
1448 jnz L(check_ret_vec_page_cross)
1449
1450
1451# ifdef USE_AS_STRNCMP
1452 cmpq $(8 / SIZE_OF_CHAR), %rdx
1453 jbe L(ret_zero_page_cross_slow_case0)
1454# endif
1455 movl $((VEC_SIZE - 8) / SIZE_OF_CHAR), %OFFSET_REG
1456 subl %eax, %OFFSET_REG
1457
1458 vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1459 vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
1460 VPTESTM %xmm0, %xmm0, %k2
1461 CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1462 kmovd %k1, %ecx
1463# ifdef USE_AS_WCSCMP
1464 subl $0x3, %ecx
1465# else
1466 incb %cl
1467# endif
1468 jnz L(check_ret_vec_page_cross)
1469
1470
1471# ifdef USE_AS_STRNCMP
1472 addl $(8 / SIZE_OF_CHAR), %OFFSET_REG
1473 subq %OFFSET_REG64, %rdx
1474 jbe L(ret_zero_page_cross_slow_case0)
1475 subq $-(CHAR_PER_VEC * 4), %rdx
1476
1477 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1478 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1479# else
1480 leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1481 leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1482# endif
1483 jmp L(prepare_loop_aligned)
1484
1485
1486
1487
1488 .p2align 4,, 10
1489L(less_8_till_page):
1490# ifdef USE_AS_WCSCMP
1491 /* If using wchar then this is the only check before we reach
1492 the page boundary. */
1493 movl (%rdi), %eax
1494 movl (%rsi), %ecx
1495 cmpl %ecx, %eax
1496 jnz L(ret_less_8_wcs)
1497# ifdef USE_AS_STRNCMP
1498 addq $-(CHAR_PER_VEC * 2), %rdx
1499 /* We already checked for len <= 1 so cannot hit that case here.
1500 */
1501# endif
1502 testl %eax, %eax
1503 jnz L(prepare_loop)
1504 ret
1505
1506 .p2align 4,, 8
1507L(ret_less_8_wcs):
1508 setl %OFFSET_REG8
1509 negl %OFFSET_REG
1510 movl %OFFSET_REG, %eax
1511 xorl %r8d, %eax
1512 ret
1513
1514# else
1515 cmpl $(VEC_SIZE - 4), %eax
1516 ja L(less_4_till_page)
1517
1518 vmovd (%rdi), %xmm0
1519 vmovd (%rsi), %xmm1
1520 VPTESTM %xmm0, %xmm0, %k2
1521 CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1522 kmovd %k1, %ecx
1523 subl $0xf, %ecx
1524 jnz L(check_ret_vec_page_cross)
1525
1526# ifdef USE_AS_STRNCMP
1527 cmpq $4, %rdx
1528 jbe L(ret_zero_page_cross_slow_case1)
1529# endif
1530 movl $((VEC_SIZE - 4) / SIZE_OF_CHAR), %OFFSET_REG
1531 subl %eax, %OFFSET_REG
1532
1533 vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1534 vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
1535 VPTESTM %xmm0, %xmm0, %k2
1536 CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1537 kmovd %k1, %ecx
1538 subl $0xf, %ecx
1539 jnz L(check_ret_vec_page_cross)
1540# ifdef USE_AS_STRNCMP
1541 addl $(4 / SIZE_OF_CHAR), %OFFSET_REG
1542 subq %OFFSET_REG64, %rdx
1543 jbe L(ret_zero_page_cross_slow_case1)
1544 subq $-(CHAR_PER_VEC * 4), %rdx
1545
1546 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1547 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1548# else
1549 leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1550 leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1551# endif
1552 jmp L(prepare_loop_aligned)
1553
1554
1555# ifdef USE_AS_STRNCMP
1556 .p2align 4,, 2
1557L(ret_zero_page_cross_slow_case1):
1558 xorl %eax, %eax
1559 ret
1560# endif
1561
1562 .p2align 4,, 10
1563L(less_4_till_page):
1564 subq %rdi, %rsi
1565 /* Extremely slow byte comparison loop. */
1566L(less_4_loop):
1567 movzbl (%rdi), %eax
1568 movzbl (%rsi, %rdi), %ecx
1569 TOLOWER_gpr (%rax, %eax)
1570 TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
1571 subl %BYTE_LOOP_REG, %eax
1572 jnz L(ret_less_4_loop)
1573 testl %ecx, %ecx
1574 jz L(ret_zero_4_loop)
1575# ifdef USE_AS_STRNCMP
1576 decq %rdx
1577 jz L(ret_zero_4_loop)
1578# endif
1579 incq %rdi
	/* The end condition is reaching the page boundary (rdi is
	   aligned).  */
1581 testb $(VEC_SIZE - 1), %dil
1582 jnz L(less_4_loop)
1583 leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
1584 addq $-(VEC_SIZE * 4), %rdi
1585# ifdef USE_AS_STRNCMP
1586 subq $-(CHAR_PER_VEC * 4), %rdx
1587# endif
1588 jmp L(prepare_loop_aligned)
1589
1590L(ret_zero_4_loop):
1591 xorl %eax, %eax
1592 ret
1593L(ret_less_4_loop):
1594 xorl %r8d, %eax
1595 subl %r8d, %eax
1596 ret
1597# endif
1598 cfi_endproc
1599 .size STRCMP, .-STRCMP
1600#endif

source code of glibc/sysdeps/x86_64/multiarch/strcmp-evex.S