1 | /* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2. |
2 | Copyright (C) 2018-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | |
23 | # if defined USE_AS_STRCASECMP_L |
24 | # include "locale-defines.h" |
25 | # endif |
26 | |
27 | # ifndef STRCMP |
28 | # define STRCMP __strcmp_avx2 |
29 | # endif |
30 | |
31 | # define PAGE_SIZE 4096 |
32 | |
33 | /* VEC_SIZE = Number of bytes in a ymm register. */ |
34 | # define VEC_SIZE 32 |
35 | |
36 | # define VMOVU vmovdqu |
37 | # define VMOVA vmovdqa |
38 | |
39 | # ifdef USE_AS_WCSCMP |
40 | /* Compare packed dwords. */ |
41 | # define VPCMPEQ vpcmpeqd |
42 | /* Compare packed dwords and store minimum. */ |
43 | # define VPMINU vpminud |
44 | /* 1 dword char == 4 bytes. */ |
45 | # define SIZE_OF_CHAR 4 |
46 | # else |
47 | /* Compare packed bytes. */ |
48 | # define VPCMPEQ vpcmpeqb |
49 | /* Compare packed bytes and store minimum. */ |
50 | # define VPMINU vpminub |
51 | /* 1 byte char == 1 byte. */ |
52 | # define SIZE_OF_CHAR 1 |
53 | # endif |
54 | |
55 | # ifdef USE_AS_STRNCMP |
56 | # define LOOP_REG r9d |
57 | # define LOOP_REG64 r9 |
58 | |
59 | # define OFFSET_REG8 r9b |
60 | # define OFFSET_REG r9d |
61 | # define OFFSET_REG64 r9 |
62 | # else |
63 | # define LOOP_REG edx |
64 | # define LOOP_REG64 rdx |
65 | |
66 | # define OFFSET_REG8 dl |
67 | # define OFFSET_REG edx |
68 | # define OFFSET_REG64 rdx |
69 | # endif |
70 | |
71 | # ifndef VZEROUPPER |
72 | # define VZEROUPPER vzeroupper |
73 | # endif |
74 | |
75 | # if defined USE_AS_STRNCMP |
76 | # define VEC_OFFSET 0 |
77 | # else |
78 | # define VEC_OFFSET (-VEC_SIZE) |
79 | # endif |
80 | |
81 | # ifdef USE_AS_STRCASECMP_L |
82 | # define BYTE_LOOP_REG OFFSET_REG |
83 | # else |
84 | # define BYTE_LOOP_REG ecx |
85 | # endif |
86 | |
87 | # ifdef USE_AS_STRCASECMP_L |
88 | # ifdef USE_AS_STRNCMP |
89 | # define STRCASECMP __strncasecmp_avx2 |
90 | # define LOCALE_REG rcx |
91 | # define LOCALE_REG_LP RCX_LP |
92 | # define STRCASECMP_NONASCII __strncasecmp_l_nonascii |
93 | # else |
94 | # define STRCASECMP __strcasecmp_avx2 |
95 | # define LOCALE_REG rdx |
96 | # define LOCALE_REG_LP RDX_LP |
97 | # define STRCASECMP_NONASCII __strcasecmp_l_nonascii |
98 | # endif |
99 | # endif |
100 | |
101 | # define xmmZERO xmm15 |
102 | # define ymmZERO ymm15 |
103 | |
104 | # define LCASE_MIN_ymm %ymm10 |
105 | # define LCASE_MAX_ymm %ymm11 |
106 | # define CASE_ADD_ymm %ymm12 |
107 | |
108 | # define LCASE_MIN_xmm %xmm10 |
109 | # define LCASE_MAX_xmm %xmm11 |
110 | # define CASE_ADD_xmm %xmm12 |
111 | |
/* r11 is never used elsewhere, so it is safe to keep this value in it. */
113 | # define TOLOWER_BASE %r11 |
114 | |
115 | # ifndef SECTION |
116 | # define SECTION(p) p##.avx |
117 | # endif |
118 | |
119 | # ifdef USE_AS_STRCASECMP_L |
120 | # define REG(x, y) x ## y |
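
/* TOLOWER below converts 'A'-'Z' to lowercase in every byte lane and
leaves all other bytes unchanged. A rough C sketch of the per-byte
idea (illustrative only; lower_byte is a hypothetical helper, not part
of this file):

	static unsigned char
	lower_byte (unsigned char c)
	{
	  // 'A'..'Z' map to 0x80..0x99, which is exactly the range that
	  // is NOT greater than 0x99 in a signed byte compare.
	  signed char t = (signed char) (c + 0x3f);
	  return c + (t > (signed char) 0x99 ? 0 : 0x20);
	}

LCASE_MIN (0x3f), LCASE_MAX (0x99) and CASE_ADD (0x20) are the
broadcast byte constants loaded from L(lcase_min), L(lcase_max) and
L(case_add) below. */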
121 | # define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \ |
122 | vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \ |
123 | vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \ |
124 | vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \ |
125 | vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \ |
126 | vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \ |
127 | vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \ |
128 | vpaddb REG(%ext, 8), reg1_in, reg1_out; \ |
129 | vpaddb REG(%ext, 9), reg2_in, reg2_out |
130 | |
131 | # define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst |
132 | # define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm) |
133 | # define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm) |
134 | |
135 | # define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \ |
136 | TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \ |
137 | VPCMPEQ scratch_reg, s2_reg, reg_out |
138 | |
139 | # define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \ |
140 | VMOVU s2_mem, reg_out; \ |
141 | CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext) |
142 | |
143 | # define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm) |
144 | # define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm) |
145 | |
146 | # define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm) |
147 | # define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm) |
148 | |
149 | # else |
150 | # define TOLOWER_gpr(...) |
151 | # define TOLOWER_ymm(...) |
152 | # define TOLOWER_xmm(...) |
153 | |
154 | # define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \ |
155 | VPCMPEQ s2_reg, s1_reg, reg_out |
156 | |
157 | # define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__) |
158 | |
159 | # define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__) |
160 | # define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__) |
161 | # endif |
162 | |
163 | /* Warning! |
164 | wcscmp/wcsncmp have to use SIGNED comparison for elements. |
165 | strcmp/strncmp have to use UNSIGNED comparison for elements. |
166 | */ |
167 | |
/* The main idea of the string comparison (byte or dword) using AVX2
consists of comparing (VPCMPEQ) two ymm vectors. The comparison is done
on either packed bytes or packed dwords depending on USE_AS_WCSCMP. In
order to check for the null char, the algorithm keeps the matched
bytes/dwords, requiring two more AVX2 instructions (VPMINU and
VPCMPEQ). In general, the cost of comparing VEC_SIZE bytes (32 bytes)
is two VPCMPEQ and one VPMINU instruction, together with movdqu and
testl instructions. The main loop (away from a page boundary) compares
4 vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
bytes) on each iteration.

The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
same as strcmp, except that a maximum offset is tracked. If the
maximum offset is reached before a difference is found, zero is
returned. */
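
/* A rough C sketch of a single VEC_SIZE check for the strcmp (byte)
case, written with AVX2 intrinsics (illustrative only; check_vec is a
hypothetical helper, not part of this file). A zero bit in the
returned mask marks a mismatch or a null char:

	#include <immintrin.h>
	#include <stdint.h>

	static uint32_t
	check_vec (const char *s1, const char *s2)
	{
	  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
	  __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
	  __m256i eq = _mm256_cmpeq_epi8 (v1, v2);	// 1s where equal
	  __m256i nl = _mm256_cmpeq_epi8 (v1, _mm256_setzero_si256 ());
	  __m256i ok = _mm256_andnot_si256 (nl, eq);	// equal AND not null
	  return (uint32_t) _mm256_movemask_epi8 (ok);
	}

The code below adds 1 to this mask: the result is zero iff every byte
matched and none was null; otherwise tzcnt of (mask + 1) is the byte
index of the first mismatch or null. */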
182 | |
183 | .section SECTION(.text), "ax" , @progbits |
184 | .align 16 |
185 | .type STRCMP, @function |
186 | .globl STRCMP |
187 | .hidden STRCMP |
188 | |
189 | # ifndef GLABEL |
190 | # define GLABEL(...) __VA_ARGS__ |
191 | # endif |
192 | |
193 | # ifdef USE_AS_STRCASECMP_L |
194 | ENTRY (GLABEL(STRCASECMP)) |
195 | movq __libc_tsd_LOCALE@gottpoff(%rip), %rax |
196 | mov %fs:(%rax), %LOCALE_REG_LP |
197 | |
/* Either 1 or 5 bytes (depending on whether CET is enabled). */
199 | .p2align 4 |
200 | END (GLABEL(STRCASECMP)) |
201 | /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ |
202 | # endif |
203 | |
204 | .p2align 4 |
205 | STRCMP: |
206 | cfi_startproc |
207 | _CET_ENDBR |
208 | CALL_MCOUNT |
209 | |
210 | # if defined USE_AS_STRCASECMP_L |
211 | /* We have to fall back on the C implementation for locales with |
212 | encodings not matching ASCII for single bytes. */ |
213 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 |
214 | mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP |
215 | # else |
216 | mov (%LOCALE_REG), %RAX_LP |
217 | # endif |
218 | testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) |
219 | jne STRCASECMP_NONASCII |
220 | leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE |
221 | # endif |
222 | |
223 | # ifdef USE_AS_STRNCMP |
/* Don't overwrite LOCALE_REG (rcx) until we have passed
L(one_or_less). Otherwise we might use the wrong locale in
the OVERFLOW_STRCMP (strcasecmp_l). */
227 | # ifdef __ILP32__ |
228 | /* Clear the upper 32 bits. */ |
229 | movl %edx, %edx |
230 | # endif |
231 | cmp $1, %RDX_LP |
232 | /* Signed comparison intentional. We use this branch to also |
233 | test cases where length >= 2^63. These very large sizes can be |
234 | handled with strcmp as there is no way for that length to |
235 | actually bound the buffer. */ |
236 | jle L(one_or_less) |
237 | # ifdef USE_AS_WCSCMP |
238 | movq %rdx, %rcx |
239 | |
/* Multiplying length by sizeof(wchar_t) can result in overflow.
Check if that is possible. All cases where overflow is possible
are cases where the length is large enough that it can never be a
bound on valid memory, so just use wcscmp. */
244 | shrq $56, %rcx |
245 | jnz OVERFLOW_STRCMP |
246 | |
247 | leaq (, %rdx, 4), %rdx |
248 | # endif |
249 | # endif |
250 | vpxor %xmmZERO, %xmmZERO, %xmmZERO |
251 | # if defined USE_AS_STRCASECMP_L |
252 | .section .rodata.cst32, "aM" , @progbits, 32 |
253 | .align 32 |
254 | L(lcase_min): |
255 | .quad 0x3f3f3f3f3f3f3f3f |
256 | .quad 0x3f3f3f3f3f3f3f3f |
257 | .quad 0x3f3f3f3f3f3f3f3f |
258 | .quad 0x3f3f3f3f3f3f3f3f |
259 | L(lcase_max): |
260 | .quad 0x9999999999999999 |
261 | .quad 0x9999999999999999 |
262 | .quad 0x9999999999999999 |
263 | .quad 0x9999999999999999 |
264 | L(case_add): |
265 | .quad 0x2020202020202020 |
266 | .quad 0x2020202020202020 |
267 | .quad 0x2020202020202020 |
268 | .quad 0x2020202020202020 |
269 | .previous |
270 | |
271 | vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm |
272 | vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm |
273 | vmovdqa L(case_add)(%rip), CASE_ADD_ymm |
274 | # endif |
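
/* The page cross check below is roughly equivalent to this C sketch
(illustrative only; may_cross_page is a hypothetical helper, not part
of this file):

	#include <stdint.h>

	static int
	may_cross_page (uintptr_t s1, uintptr_t s2)
	{
	  uintptr_t off = (s1 | s2) & (PAGE_SIZE - 1);
	  return off > PAGE_SIZE - VEC_SIZE * 4;
	}

Shifting left by 20 keeps only the low 12 bits (the page offset, since
PAGE_SIZE is 4096) in the top of eax for the unsigned compare. Or-ing
the two offsets can only set extra bits, so the test may have false
positives but never false negatives. */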
275 | movl %edi, %eax |
276 | orl %esi, %eax |
277 | sall $20, %eax |
278 | /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ |
279 | cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax |
280 | ja L(page_cross) |
281 | |
282 | L(no_page_cross): |
283 | /* Safe to compare 4x vectors. */ |
284 | VMOVU (%rdi), %ymm0 |
/* 1s where s1 and s2 are equal. Just VPCMPEQ if it is not strcasecmp.
Otherwise convert ymm0 and the load from rsi to lowercase first.
ymm2 is scratch and ymm1 is the result. */
288 | CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) |
289 | /* 1s at null CHAR. */ |
290 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
291 | /* 1s where s1 and s2 equal AND not null CHAR. */ |
292 | vpandn %ymm1, %ymm2, %ymm1 |
293 | |
294 | /* All 1s -> keep going, any 0s -> return. */ |
295 | vpmovmskb %ymm1, %ecx |
296 | # ifdef USE_AS_STRNCMP |
297 | cmpq $VEC_SIZE, %rdx |
298 | jbe L(vec_0_test_len) |
299 | # endif |
300 | |
/* All 1s means everything compared equal; incl then overflows to zero.
Otherwise the carry from incl propagates only up to the first problem
lane, so tzcnt of the result gives the position of the first mismatch
or null. */
304 | incl %ecx |
305 | jz L(more_3x_vec) |
306 | |
307 | .p2align 4,, 4 |
308 | L(return_vec_0): |
309 | tzcntl %ecx, %ecx |
310 | # ifdef USE_AS_WCSCMP |
311 | movl (%rdi, %rcx), %edx |
312 | xorl %eax, %eax |
313 | cmpl (%rsi, %rcx), %edx |
314 | je L(ret0) |
315 | setl %al |
316 | negl %eax |
317 | orl $1, %eax |
318 | # else |
319 | movzbl (%rdi, %rcx), %eax |
320 | movzbl (%rsi, %rcx), %ecx |
321 | TOLOWER_gpr (%rax, %eax) |
322 | TOLOWER_gpr (%rcx, %ecx) |
323 | subl %ecx, %eax |
324 | # endif |
325 | L(ret0): |
326 | L(return_vzeroupper): |
327 | ZERO_UPPER_VEC_REGISTERS_RETURN |
328 | |
329 | # ifdef USE_AS_STRNCMP |
330 | .p2align 4,, 8 |
331 | L(vec_0_test_len): |
332 | notl %ecx |
333 | bzhil %edx, %ecx, %eax |
334 | jnz L(return_vec_0) |
/* Align if it will cross a fetch block. */
336 | .p2align 4,, 2 |
337 | L(ret_zero): |
338 | xorl %eax, %eax |
339 | VZEROUPPER_RETURN |
340 | |
341 | .p2align 4,, 5 |
342 | L(one_or_less): |
343 | # ifdef USE_AS_STRCASECMP_L |
344 | /* Set locale argument for strcasecmp. */ |
345 | movq %LOCALE_REG, %rdx |
346 | # endif |
347 | jb L(ret_zero) |
348 | /* 'nbe' covers the case where length is negative (large |
349 | unsigned). */ |
350 | jnbe OVERFLOW_STRCMP |
351 | # ifdef USE_AS_WCSCMP |
352 | movl (%rdi), %edx |
353 | xorl %eax, %eax |
354 | cmpl (%rsi), %edx |
355 | je L(ret1) |
356 | setl %al |
357 | negl %eax |
358 | orl $1, %eax |
359 | # else |
360 | movzbl (%rdi), %eax |
361 | movzbl (%rsi), %ecx |
362 | TOLOWER_gpr (%rax, %eax) |
363 | TOLOWER_gpr (%rcx, %ecx) |
364 | subl %ecx, %eax |
365 | # endif |
366 | L(ret1): |
367 | ret |
368 | # endif |
369 | |
370 | .p2align 4,, 10 |
371 | L(return_vec_1): |
372 | tzcntl %ecx, %ecx |
373 | # ifdef USE_AS_STRNCMP |
/* rdx must be > VEC_SIZE here, so it is safe to subtract without
fear of wrap-around. */
376 | addq $-VEC_SIZE, %rdx |
377 | cmpq %rcx, %rdx |
378 | jbe L(ret_zero) |
379 | # endif |
380 | # ifdef USE_AS_WCSCMP |
381 | movl VEC_SIZE(%rdi, %rcx), %edx |
382 | xorl %eax, %eax |
383 | cmpl VEC_SIZE(%rsi, %rcx), %edx |
384 | je L(ret2) |
385 | setl %al |
386 | negl %eax |
387 | orl $1, %eax |
388 | # else |
389 | movzbl VEC_SIZE(%rdi, %rcx), %eax |
390 | movzbl VEC_SIZE(%rsi, %rcx), %ecx |
391 | TOLOWER_gpr (%rax, %eax) |
392 | TOLOWER_gpr (%rcx, %ecx) |
393 | subl %ecx, %eax |
394 | # endif |
395 | L(ret2): |
396 | VZEROUPPER_RETURN |
397 | |
398 | .p2align 4,, 10 |
399 | # ifdef USE_AS_STRNCMP |
400 | L(return_vec_3): |
401 | salq $32, %rcx |
402 | # endif |
403 | |
404 | L(return_vec_2): |
405 | # ifndef USE_AS_STRNCMP |
406 | tzcntl %ecx, %ecx |
407 | # else |
408 | tzcntq %rcx, %rcx |
409 | cmpq %rcx, %rdx |
410 | jbe L(ret_zero) |
411 | # endif |
412 | |
413 | # ifdef USE_AS_WCSCMP |
414 | movl (VEC_SIZE * 2)(%rdi, %rcx), %edx |
415 | xorl %eax, %eax |
416 | cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx |
417 | je L(ret3) |
418 | setl %al |
419 | negl %eax |
420 | orl $1, %eax |
421 | # else |
422 | movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax |
423 | movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx |
424 | TOLOWER_gpr (%rax, %eax) |
425 | TOLOWER_gpr (%rcx, %ecx) |
426 | subl %ecx, %eax |
427 | # endif |
428 | L(ret3): |
429 | VZEROUPPER_RETURN |
430 | |
431 | # ifndef USE_AS_STRNCMP |
432 | .p2align 4,, 10 |
433 | L(return_vec_3): |
434 | tzcntl %ecx, %ecx |
435 | # ifdef USE_AS_WCSCMP |
436 | movl (VEC_SIZE * 3)(%rdi, %rcx), %edx |
437 | xorl %eax, %eax |
438 | cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx |
439 | je L(ret4) |
440 | setl %al |
441 | negl %eax |
442 | orl $1, %eax |
443 | # else |
444 | movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax |
445 | movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx |
446 | TOLOWER_gpr (%rax, %eax) |
447 | TOLOWER_gpr (%rcx, %ecx) |
448 | subl %ecx, %eax |
449 | # endif |
450 | L(ret4): |
451 | VZEROUPPER_RETURN |
452 | # endif |
453 | |
454 | .p2align 4,, 10 |
455 | L(more_3x_vec): |
456 | /* Safe to compare 4x vectors. */ |
457 | VMOVU VEC_SIZE(%rdi), %ymm0 |
458 | CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) |
459 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
460 | vpandn %ymm1, %ymm2, %ymm1 |
461 | vpmovmskb %ymm1, %ecx |
462 | incl %ecx |
463 | jnz L(return_vec_1) |
464 | |
465 | # ifdef USE_AS_STRNCMP |
466 | subq $(VEC_SIZE * 2), %rdx |
467 | jbe L(ret_zero) |
468 | # endif |
469 | |
470 | VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 |
471 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1) |
472 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
473 | vpandn %ymm1, %ymm2, %ymm1 |
474 | vpmovmskb %ymm1, %ecx |
475 | incl %ecx |
476 | jnz L(return_vec_2) |
477 | |
478 | VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 |
479 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1) |
480 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
481 | vpandn %ymm1, %ymm2, %ymm1 |
482 | vpmovmskb %ymm1, %ecx |
483 | incl %ecx |
484 | jnz L(return_vec_3) |
485 | |
486 | # ifdef USE_AS_STRNCMP |
487 | cmpq $(VEC_SIZE * 2), %rdx |
488 | jbe L(ret_zero) |
489 | # endif |
490 | |
491 | # ifdef USE_AS_WCSCMP |
/* Any non-zero positive value that doesn't interfere with 0x1. */
494 | movl $2, %r8d |
495 | |
496 | # else |
497 | xorl %r8d, %r8d |
498 | # endif |
499 | |
500 | /* The prepare labels are various entry points from the page |
501 | cross logic. */ |
502 | L(prepare_loop): |
503 | |
504 | # ifdef USE_AS_STRNCMP |
/* Store N + (VEC_SIZE * 4) and place the check at the beginning of
the loop. */
507 | leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx |
508 | # endif |
509 | L(prepare_loop_no_len): |
510 | |
511 | /* Align s1 and adjust s2 accordingly. */ |
512 | subq %rdi, %rsi |
513 | andq $-(VEC_SIZE * 4), %rdi |
514 | addq %rdi, %rsi |
515 | |
516 | # ifdef USE_AS_STRNCMP |
517 | subq %rdi, %rdx |
518 | # endif |
519 | |
520 | L(prepare_loop_aligned): |
521 | /* eax stores distance from rsi to next page cross. These cases |
522 | need to be handled specially as the 4x loop could potentially |
523 | read memory past the length of s1 or s2 and across a page |
524 | boundary. */ |
525 | movl $-(VEC_SIZE * 4), %eax |
526 | subl %esi, %eax |
527 | andl $(PAGE_SIZE - 1), %eax |
528 | |
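/* Rough C sketch of the 4x VEC reduction performed by the loop body
below (illustrative only, strcmp byte case; check_4x_vec is a
hypothetical helper, not part of this file). It returns non-zero iff
any of the 4 x 32 byte positions has a mismatch or a null char:

	#include <immintrin.h>

	static int
	check_4x_vec (const char *s1, const char *s2)
	{
	  __m256i m = _mm256_set1_epi8 (-1);
	  for (int i = 0; i < 4; i++)
	    {
	      __m256i v1 = _mm256_load_si256 ((const __m256i *) (s1 + 32 * i));
	      __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (s2 + 32 * i));
	      __m256i eq = _mm256_cmpeq_epi8 (v1, v2);
	      // Lane is 0 on mismatch, else the (possibly null) s1 byte.
	      __m256i keep = _mm256_and_si256 (v1, eq);
	      m = _mm256_min_epu8 (m, keep);
	    }
	  return _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ()));
	}

The asm keeps the per-VEC masks around so that, once the loop exits,
it can tell which of the 4 VECs contained the first mismatch/null. */
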
529 | /* Loop 4x comparisons at a time. */ |
530 | .p2align 4 |
531 | L(loop): |
532 | |
533 | /* End condition for strncmp. */ |
534 | # ifdef USE_AS_STRNCMP |
535 | subq $(VEC_SIZE * 4), %rdx |
536 | jbe L(ret_zero) |
537 | # endif |
538 | |
539 | subq $-(VEC_SIZE * 4), %rdi |
540 | subq $-(VEC_SIZE * 4), %rsi |
541 | |
542 | /* Check if rsi loads will cross a page boundary. */ |
543 | addl $-(VEC_SIZE * 4), %eax |
544 | jnb L(page_cross_during_loop) |
545 | |
546 | /* Loop entry after handling page cross during loop. */ |
547 | L(loop_skip_page_cross_check): |
548 | VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 |
549 | VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 |
550 | VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 |
551 | VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 |
552 | |
553 | /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ |
554 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1) |
555 | CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3) |
556 | CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) |
557 | CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) |
558 | |
/* Each lane becomes 0 if there was a mismatch or a null CHAR, and
stays non-zero otherwise. */
561 | vpand %ymm0, %ymm1, %ymm1 |
562 | |
563 | |
564 | vpand %ymm2, %ymm3, %ymm3 |
565 | vpand %ymm4, %ymm5, %ymm5 |
566 | vpand %ymm6, %ymm7, %ymm7 |
567 | |
568 | VPMINU %ymm1, %ymm3, %ymm3 |
569 | VPMINU %ymm5, %ymm7, %ymm7 |
570 | |
571 | /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ |
572 | VPMINU %ymm3, %ymm7, %ymm7 |
573 | |
574 | /* If any 0 CHAR then done. */ |
575 | VPCMPEQ %ymm7, %ymmZERO, %ymm7 |
576 | vpmovmskb %ymm7, %LOOP_REG |
577 | testl %LOOP_REG, %LOOP_REG |
578 | jz L(loop) |
579 | |
/* Find which VEC has the mismatch or end of string. */
581 | VPCMPEQ %ymm1, %ymmZERO, %ymm1 |
582 | vpmovmskb %ymm1, %ecx |
583 | testl %ecx, %ecx |
584 | jnz L(return_vec_0_end) |
585 | |
586 | |
587 | VPCMPEQ %ymm3, %ymmZERO, %ymm3 |
588 | vpmovmskb %ymm3, %ecx |
589 | testl %ecx, %ecx |
590 | jnz L(return_vec_1_end) |
591 | |
592 | L(return_vec_2_3_end): |
593 | # ifdef USE_AS_STRNCMP |
594 | subq $(VEC_SIZE * 2), %rdx |
595 | jbe L(ret_zero_end) |
596 | # endif |
597 | |
598 | VPCMPEQ %ymm5, %ymmZERO, %ymm5 |
599 | vpmovmskb %ymm5, %ecx |
600 | testl %ecx, %ecx |
601 | jnz L(return_vec_2_end) |
602 | |
/* LOOP_REG contains matches for null/mismatch from the loop. If
VEC 0, 1, and 2 all have no null and no mismatches then the mismatch
must come entirely from VEC 3, which is fully represented by
LOOP_REG. */
607 | tzcntl %LOOP_REG, %LOOP_REG |
608 | |
609 | # ifdef USE_AS_STRNCMP |
610 | subl $-(VEC_SIZE), %LOOP_REG |
611 | cmpq %LOOP_REG64, %rdx |
612 | jbe L(ret_zero_end) |
613 | # endif |
614 | |
615 | # ifdef USE_AS_WCSCMP |
616 | movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx |
617 | xorl %eax, %eax |
618 | cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx |
619 | je L(ret5) |
620 | setl %al |
621 | negl %eax |
622 | xorl %r8d, %eax |
623 | # else |
624 | movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax |
625 | movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx |
626 | TOLOWER_gpr (%rax, %eax) |
627 | TOLOWER_gpr (%rcx, %ecx) |
628 | subl %ecx, %eax |
629 | xorl %r8d, %eax |
630 | subl %r8d, %eax |
631 | # endif |
632 | L(ret5): |
633 | VZEROUPPER_RETURN |
634 | |
635 | # ifdef USE_AS_STRNCMP |
636 | .p2align 4,, 2 |
637 | L(ret_zero_end): |
638 | xorl %eax, %eax |
639 | VZEROUPPER_RETURN |
640 | # endif |
641 | |
642 | |
/* The L(return_vec_N_end) labels differ from L(return_vec_N) in that
they use the value of `r8` to negate the return value. This is
because the page cross logic can swap `rdi` and `rsi`. */
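
/* For the strcmp/strncmp (byte) case r8 is either 0 (rdi/rsi not
swapped) or -1 (swapped), and the `xor r8' / `sub r8' pair used in
these return paths is a branchless conditional negate, roughly
(illustrative C only):

	ret = (ret ^ r8) - r8;	// identity for r8 == 0, -ret for r8 == -1

For wcscmp, r8 is 2 or -4 and only the xor is applied, which is enough
to give the 0/-1 compare result the correct sign while keeping it
non-zero. */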
646 | .p2align 4,, 10 |
647 | # ifdef USE_AS_STRNCMP |
648 | L(return_vec_1_end): |
649 | salq $32, %rcx |
650 | # endif |
651 | L(return_vec_0_end): |
652 | # ifndef USE_AS_STRNCMP |
653 | tzcntl %ecx, %ecx |
654 | # else |
655 | tzcntq %rcx, %rcx |
656 | cmpq %rcx, %rdx |
657 | jbe L(ret_zero_end) |
658 | # endif |
659 | |
660 | # ifdef USE_AS_WCSCMP |
661 | movl (%rdi, %rcx), %edx |
662 | xorl %eax, %eax |
663 | cmpl (%rsi, %rcx), %edx |
664 | je L(ret6) |
665 | setl %al |
666 | negl %eax |
667 | xorl %r8d, %eax |
668 | # else |
669 | movzbl (%rdi, %rcx), %eax |
670 | movzbl (%rsi, %rcx), %ecx |
671 | TOLOWER_gpr (%rax, %eax) |
672 | TOLOWER_gpr (%rcx, %ecx) |
673 | subl %ecx, %eax |
674 | xorl %r8d, %eax |
675 | subl %r8d, %eax |
676 | # endif |
677 | L(ret6): |
678 | VZEROUPPER_RETURN |
679 | |
680 | # ifndef USE_AS_STRNCMP |
681 | .p2align 4,, 10 |
682 | L(return_vec_1_end): |
683 | tzcntl %ecx, %ecx |
684 | # ifdef USE_AS_WCSCMP |
685 | movl VEC_SIZE(%rdi, %rcx), %edx |
686 | xorl %eax, %eax |
687 | cmpl VEC_SIZE(%rsi, %rcx), %edx |
688 | je L(ret7) |
689 | setl %al |
690 | negl %eax |
691 | xorl %r8d, %eax |
692 | # else |
693 | movzbl VEC_SIZE(%rdi, %rcx), %eax |
694 | movzbl VEC_SIZE(%rsi, %rcx), %ecx |
695 | TOLOWER_gpr (%rax, %eax) |
696 | TOLOWER_gpr (%rcx, %ecx) |
697 | subl %ecx, %eax |
698 | xorl %r8d, %eax |
699 | subl %r8d, %eax |
700 | # endif |
701 | L(ret7): |
702 | VZEROUPPER_RETURN |
703 | # endif |
704 | |
705 | .p2align 4,, 10 |
706 | L(return_vec_2_end): |
707 | tzcntl %ecx, %ecx |
708 | # ifdef USE_AS_STRNCMP |
709 | cmpq %rcx, %rdx |
710 | jbe L(ret_zero_page_cross) |
711 | # endif |
712 | # ifdef USE_AS_WCSCMP |
713 | movl (VEC_SIZE * 2)(%rdi, %rcx), %edx |
714 | xorl %eax, %eax |
715 | cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx |
716 | je L(ret11) |
717 | setl %al |
718 | negl %eax |
719 | xorl %r8d, %eax |
720 | # else |
721 | movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax |
722 | movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx |
723 | TOLOWER_gpr (%rax, %eax) |
724 | TOLOWER_gpr (%rcx, %ecx) |
725 | subl %ecx, %eax |
726 | xorl %r8d, %eax |
727 | subl %r8d, %eax |
728 | # endif |
729 | L(ret11): |
730 | VZEROUPPER_RETURN |
731 | |
732 | |
733 | /* Page cross in rsi in next 4x VEC. */ |
734 | |
735 | /* TODO: Improve logic here. */ |
736 | .p2align 4,, 10 |
737 | L(page_cross_during_loop): |
738 | /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ |
739 | |
/* Optimistically rsi and rdi are both aligned, in which case we
don't need any extra logic here. */
742 | cmpl $-(VEC_SIZE * 4), %eax |
/* We don't adjust eax before jumping back to the loop, and we will
never hit the page cross case again. */
745 | je L(loop_skip_page_cross_check) |
746 | |
747 | /* Check if we can safely load a VEC. */ |
748 | cmpl $-(VEC_SIZE * 3), %eax |
749 | jle L(less_1x_vec_till_page_cross) |
750 | |
751 | VMOVA (%rdi), %ymm0 |
752 | CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) |
753 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
754 | vpandn %ymm1, %ymm2, %ymm1 |
755 | vpmovmskb %ymm1, %ecx |
756 | incl %ecx |
757 | jnz L(return_vec_0_end) |
758 | |
759 | /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ |
760 | cmpl $-(VEC_SIZE * 2), %eax |
761 | jg L(more_2x_vec_till_page_cross) |
762 | |
763 | .p2align 4,, 4 |
764 | L(less_1x_vec_till_page_cross): |
765 | subl $-(VEC_SIZE * 4), %eax |
/* Guaranteed to be safe to read from rdi - VEC_SIZE here. The only
concerning case is the first iteration, if the incoming s1 was near
the start of a page and s2 near the end. If s1 was near the start of
the page we already aligned to the nearest VEC_SIZE * 4, so it is
guaranteed to be safe to read back -VEC_SIZE. If rdi is truly at the
start of a page here, it means the previous page (rdi - VEC_SIZE) has
already been loaded earlier, so it must be valid. */
773 | VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 |
774 | CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1) |
775 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
776 | vpandn %ymm1, %ymm2, %ymm1 |
777 | vpmovmskb %ymm1, %ecx |
778 | |
/* Mask of potentially valid bits. The lower bits can come from
out-of-range comparisons (but they are safe regarding page crosses). */
781 | movl $-1, %r10d |
782 | shlxl %esi, %r10d, %r10d |
783 | notl %ecx |
784 | |
785 | # ifdef USE_AS_STRNCMP |
786 | cmpq %rax, %rdx |
787 | jbe L(return_page_cross_end_check) |
788 | # endif |
789 | movl %eax, %OFFSET_REG |
790 | addl $(PAGE_SIZE - VEC_SIZE * 4), %eax |
791 | |
792 | andl %r10d, %ecx |
793 | jz L(loop_skip_page_cross_check) |
794 | |
795 | .p2align 4,, 3 |
796 | L(return_page_cross_end): |
797 | tzcntl %ecx, %ecx |
798 | |
799 | # ifdef USE_AS_STRNCMP |
800 | leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx |
801 | L(return_page_cross_cmp_mem): |
802 | # else |
803 | addl %OFFSET_REG, %ecx |
804 | # endif |
805 | # ifdef USE_AS_WCSCMP |
806 | movl VEC_OFFSET(%rdi, %rcx), %edx |
807 | xorl %eax, %eax |
808 | cmpl VEC_OFFSET(%rsi, %rcx), %edx |
809 | je L(ret8) |
810 | setl %al |
811 | negl %eax |
812 | xorl %r8d, %eax |
813 | # else |
814 | movzbl VEC_OFFSET(%rdi, %rcx), %eax |
815 | movzbl VEC_OFFSET(%rsi, %rcx), %ecx |
816 | TOLOWER_gpr (%rax, %eax) |
817 | TOLOWER_gpr (%rcx, %ecx) |
818 | subl %ecx, %eax |
819 | xorl %r8d, %eax |
820 | subl %r8d, %eax |
821 | # endif |
822 | L(ret8): |
823 | VZEROUPPER_RETURN |
824 | |
825 | # ifdef USE_AS_STRNCMP |
826 | .p2align 4,, 10 |
827 | L(return_page_cross_end_check): |
828 | andl %r10d, %ecx |
829 | tzcntl %ecx, %ecx |
830 | leal -VEC_SIZE(%rax, %rcx), %ecx |
831 | cmpl %ecx, %edx |
832 | ja L(return_page_cross_cmp_mem) |
833 | xorl %eax, %eax |
834 | VZEROUPPER_RETURN |
835 | # endif |
836 | |
837 | |
838 | .p2align 4,, 10 |
839 | L(more_2x_vec_till_page_cross): |
/* If there is more than 2x VEC until the page cross, we will
complete a full loop iteration here. */
842 | |
843 | VMOVU VEC_SIZE(%rdi), %ymm0 |
844 | CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) |
845 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
846 | vpandn %ymm1, %ymm2, %ymm1 |
847 | vpmovmskb %ymm1, %ecx |
848 | incl %ecx |
849 | jnz L(return_vec_1_end) |
850 | |
851 | # ifdef USE_AS_STRNCMP |
852 | cmpq $(VEC_SIZE * 2), %rdx |
853 | jbe L(ret_zero_in_loop_page_cross) |
854 | # endif |
855 | |
856 | subl $-(VEC_SIZE * 4), %eax |
857 | |
858 | /* Safe to include comparisons from lower bytes. */ |
859 | VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 |
860 | CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1) |
861 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
862 | vpandn %ymm1, %ymm2, %ymm1 |
863 | vpmovmskb %ymm1, %ecx |
864 | incl %ecx |
865 | jnz L(return_vec_page_cross_0) |
866 | |
867 | VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 |
868 | CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1) |
869 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
870 | vpandn %ymm1, %ymm2, %ymm1 |
871 | vpmovmskb %ymm1, %ecx |
872 | incl %ecx |
873 | jnz L(return_vec_page_cross_1) |
874 | |
875 | # ifdef USE_AS_STRNCMP |
/* Must check the length here, as the length might preclude reading
the next page. */
878 | cmpq %rax, %rdx |
879 | jbe L(ret_zero_in_loop_page_cross) |
880 | # endif |
881 | |
882 | /* Finish the loop. */ |
883 | VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 |
884 | VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 |
885 | |
886 | CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) |
887 | CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) |
888 | vpand %ymm4, %ymm5, %ymm5 |
889 | vpand %ymm6, %ymm7, %ymm7 |
890 | VPMINU %ymm5, %ymm7, %ymm7 |
891 | VPCMPEQ %ymm7, %ymmZERO, %ymm7 |
892 | vpmovmskb %ymm7, %LOOP_REG |
893 | testl %LOOP_REG, %LOOP_REG |
894 | jnz L(return_vec_2_3_end) |
895 | |
/* Best for code size to use an unconditional jump here. If this case
were hot it would be faster to duplicate the L(return_vec_2_3_end)
code as the fall-through and jump back to the loop on the mismatch
comparison. */
900 | subq $-(VEC_SIZE * 4), %rdi |
901 | subq $-(VEC_SIZE * 4), %rsi |
902 | addl $(PAGE_SIZE - VEC_SIZE * 8), %eax |
903 | # ifdef USE_AS_STRNCMP |
904 | subq $(VEC_SIZE * 4), %rdx |
905 | ja L(loop_skip_page_cross_check) |
906 | L(ret_zero_in_loop_page_cross): |
907 | xorl %eax, %eax |
908 | VZEROUPPER_RETURN |
909 | # else |
910 | jmp L(loop_skip_page_cross_check) |
911 | # endif |
912 | |
913 | |
914 | .p2align 4,, 10 |
915 | L(return_vec_page_cross_0): |
916 | addl $-VEC_SIZE, %eax |
917 | L(return_vec_page_cross_1): |
918 | tzcntl %ecx, %ecx |
919 | # ifdef USE_AS_STRNCMP |
920 | leal -VEC_SIZE(%rax, %rcx), %ecx |
921 | cmpq %rcx, %rdx |
922 | jbe L(ret_zero_in_loop_page_cross) |
923 | # else |
924 | addl %eax, %ecx |
925 | # endif |
926 | |
927 | # ifdef USE_AS_WCSCMP |
928 | movl VEC_OFFSET(%rdi, %rcx), %edx |
929 | xorl %eax, %eax |
930 | cmpl VEC_OFFSET(%rsi, %rcx), %edx |
931 | je L(ret9) |
932 | setl %al |
933 | negl %eax |
934 | xorl %r8d, %eax |
935 | # else |
936 | movzbl VEC_OFFSET(%rdi, %rcx), %eax |
937 | movzbl VEC_OFFSET(%rsi, %rcx), %ecx |
938 | TOLOWER_gpr (%rax, %eax) |
939 | TOLOWER_gpr (%rcx, %ecx) |
940 | subl %ecx, %eax |
941 | xorl %r8d, %eax |
942 | subl %r8d, %eax |
943 | # endif |
944 | L(ret9): |
945 | VZEROUPPER_RETURN |
946 | |
947 | |
948 | .p2align 4,, 10 |
949 | L(page_cross): |
950 | # ifndef USE_AS_STRNCMP |
/* If both are VEC aligned we don't need any special logic here.
This is only valid for strcmp, where the stop condition is guaranteed
to be reachable by just reading memory. */
954 | testl $((VEC_SIZE - 1) << 20), %eax |
955 | jz L(no_page_cross) |
956 | # endif |
957 | |
958 | movl %edi, %eax |
959 | movl %esi, %ecx |
960 | andl $(PAGE_SIZE - 1), %eax |
961 | andl $(PAGE_SIZE - 1), %ecx |
962 | |
963 | xorl %OFFSET_REG, %OFFSET_REG |
964 | |
965 | /* Check which is closer to page cross, s1 or s2. */ |
966 | cmpl %eax, %ecx |
967 | jg L(page_cross_s2) |
968 | |
/* The previous page cross check has false positives. Check for a
true positive, as the page cross logic is very expensive. */
971 | subl $(PAGE_SIZE - VEC_SIZE * 4), %eax |
972 | jbe L(no_page_cross) |
973 | |
/* Set r8 so that it does not interfere with the normal return value
(rdi and rsi were not swapped). */
976 | # ifdef USE_AS_WCSCMP |
/* Any non-zero positive value that doesn't interfere with 0x1. */
979 | movl $2, %r8d |
980 | # else |
981 | xorl %r8d, %r8d |
982 | # endif |
983 | |
984 | /* Check if less than 1x VEC till page cross. */ |
985 | subl $(VEC_SIZE * 3), %eax |
986 | jg L(less_1x_vec_till_page) |
987 | |
/* If more than 1x VEC until the page cross, loop through safely
loadable memory until within 1x VEC of the page cross. */
990 | |
991 | .p2align 4,, 10 |
992 | L(page_cross_loop): |
993 | |
994 | VMOVU (%rdi, %OFFSET_REG64), %ymm0 |
995 | CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) |
996 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
997 | vpandn %ymm1, %ymm2, %ymm1 |
998 | vpmovmskb %ymm1, %ecx |
999 | incl %ecx |
1000 | |
1001 | jnz L(check_ret_vec_page_cross) |
1002 | addl $VEC_SIZE, %OFFSET_REG |
1003 | # ifdef USE_AS_STRNCMP |
1004 | cmpq %OFFSET_REG64, %rdx |
1005 | jbe L(ret_zero_page_cross) |
1006 | # endif |
1007 | addl $VEC_SIZE, %eax |
1008 | jl L(page_cross_loop) |
1009 | |
1010 | subl %eax, %OFFSET_REG |
/* OFFSET_REG has the distance to the page cross minus VEC_SIZE.
This is guaranteed not to cross the page, so it is safe to load.
Since we have already loaded at least 1 VEC from rsi it is also
guaranteed to be safe. */
1015 | |
1016 | VMOVU (%rdi, %OFFSET_REG64), %ymm0 |
1017 | CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) |
1018 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
1019 | vpandn %ymm1, %ymm2, %ymm1 |
1020 | vpmovmskb %ymm1, %ecx |
1021 | |
1022 | # ifdef USE_AS_STRNCMP |
1023 | leal VEC_SIZE(%OFFSET_REG64), %eax |
1024 | cmpq %rax, %rdx |
1025 | jbe L(check_ret_vec_page_cross2) |
1026 | addq %rdi, %rdx |
1027 | # endif |
1028 | incl %ecx |
1029 | jz L(prepare_loop_no_len) |
1030 | |
1031 | .p2align 4,, 4 |
1032 | L(ret_vec_page_cross): |
1033 | # ifndef USE_AS_STRNCMP |
1034 | L(check_ret_vec_page_cross): |
1035 | # endif |
1036 | tzcntl %ecx, %ecx |
1037 | addl %OFFSET_REG, %ecx |
1038 | L(ret_vec_page_cross_cont): |
1039 | # ifdef USE_AS_WCSCMP |
1040 | movl (%rdi, %rcx), %edx |
1041 | xorl %eax, %eax |
1042 | cmpl (%rsi, %rcx), %edx |
1043 | je L(ret12) |
1044 | setl %al |
1045 | negl %eax |
1046 | xorl %r8d, %eax |
1047 | # else |
1048 | movzbl (%rdi, %rcx), %eax |
1049 | movzbl (%rsi, %rcx), %ecx |
1050 | TOLOWER_gpr (%rax, %eax) |
1051 | TOLOWER_gpr (%rcx, %ecx) |
1052 | subl %ecx, %eax |
1053 | xorl %r8d, %eax |
1054 | subl %r8d, %eax |
1055 | # endif |
1056 | L(ret12): |
1057 | VZEROUPPER_RETURN |
1058 | |
1059 | # ifdef USE_AS_STRNCMP |
1060 | .p2align 4,, 10 |
1061 | L(check_ret_vec_page_cross2): |
1062 | incl %ecx |
1063 | L(check_ret_vec_page_cross): |
1064 | tzcntl %ecx, %ecx |
1065 | addl %OFFSET_REG, %ecx |
1066 | cmpq %rcx, %rdx |
1067 | ja L(ret_vec_page_cross_cont) |
1068 | .p2align 4,, 2 |
1069 | L(ret_zero_page_cross): |
1070 | xorl %eax, %eax |
1071 | VZEROUPPER_RETURN |
1072 | # endif |
1073 | |
1074 | .p2align 4,, 4 |
1075 | L(page_cross_s2): |
1076 | /* Ensure this is a true page cross. */ |
1077 | subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx |
1078 | jbe L(no_page_cross) |
1079 | |
1080 | |
1081 | movl %ecx, %eax |
1082 | movq %rdi, %rcx |
1083 | movq %rsi, %rdi |
1084 | movq %rcx, %rsi |
1085 | |
/* Set r8 to negate the return value, as rdi and rsi have been swapped. */
1087 | # ifdef USE_AS_WCSCMP |
1088 | movl $-4, %r8d |
1089 | # else |
1090 | movl $-1, %r8d |
1091 | # endif |
1092 | xorl %OFFSET_REG, %OFFSET_REG |
1093 | |
1094 | /* Check if more than 1x VEC till page cross. */ |
1095 | subl $(VEC_SIZE * 3), %eax |
1096 | jle L(page_cross_loop) |
1097 | |
1098 | .p2align 4,, 6 |
1099 | L(less_1x_vec_till_page): |
1100 | /* Find largest load size we can use. */ |
1101 | cmpl $16, %eax |
1102 | ja L(less_16_till_page) |
1103 | |
1104 | VMOVU (%rdi), %xmm0 |
1105 | CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1) |
1106 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1107 | vpandn %xmm1, %xmm2, %xmm1 |
1108 | vpmovmskb %ymm1, %ecx |
1109 | incw %cx |
1110 | jnz L(check_ret_vec_page_cross) |
1111 | movl $16, %OFFSET_REG |
1112 | # ifdef USE_AS_STRNCMP |
1113 | cmpq %OFFSET_REG64, %rdx |
1114 | jbe L(ret_zero_page_cross_slow_case0) |
1115 | subl %eax, %OFFSET_REG |
1116 | # else |
1117 | /* Explicit check for 16 byte alignment. */ |
1118 | subl %eax, %OFFSET_REG |
1119 | jz L(prepare_loop) |
1120 | # endif |
1121 | |
1122 | VMOVU (%rdi, %OFFSET_REG64), %xmm0 |
1123 | CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1) |
1124 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1125 | vpandn %xmm1, %xmm2, %xmm1 |
1126 | vpmovmskb %ymm1, %ecx |
1127 | incw %cx |
1128 | jnz L(check_ret_vec_page_cross) |
1129 | |
1130 | # ifdef USE_AS_STRNCMP |
1131 | addl $16, %OFFSET_REG |
1132 | subq %OFFSET_REG64, %rdx |
1133 | jbe L(ret_zero_page_cross_slow_case0) |
1134 | subq $-(VEC_SIZE * 4), %rdx |
1135 | |
1136 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1137 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1138 | # else |
1139 | leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1140 | leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1141 | # endif |
1142 | jmp L(prepare_loop_aligned) |
1143 | |
1144 | # ifdef USE_AS_STRNCMP |
1145 | .p2align 4,, 2 |
1146 | L(ret_zero_page_cross_slow_case0): |
1147 | xorl %eax, %eax |
1148 | ret |
1149 | # endif |
1150 | |
1151 | |
1152 | .p2align 4,, 10 |
1153 | L(less_16_till_page): |
1154 | /* Find largest load size we can use. */ |
1155 | cmpl $24, %eax |
1156 | ja L(less_8_till_page) |
1157 | |
1158 | vmovq (%rdi), %xmm0 |
1159 | vmovq (%rsi), %xmm1 |
1160 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1161 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1162 | vpandn %xmm1, %xmm2, %xmm1 |
1163 | vpmovmskb %ymm1, %ecx |
1164 | incb %cl |
1165 | jnz L(check_ret_vec_page_cross) |
1166 | |
1167 | |
1168 | # ifdef USE_AS_STRNCMP |
1169 | cmpq $8, %rdx |
1170 | jbe L(ret_zero_page_cross_slow_case0) |
1171 | # endif |
1172 | movl $24, %OFFSET_REG |
1173 | /* Explicit check for 16 byte alignment. */ |
1174 | subl %eax, %OFFSET_REG |
1175 | |
1176 | |
1177 | |
1178 | vmovq (%rdi, %OFFSET_REG64), %xmm0 |
1179 | vmovq (%rsi, %OFFSET_REG64), %xmm1 |
1180 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1181 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1182 | vpandn %xmm1, %xmm2, %xmm1 |
1183 | vpmovmskb %ymm1, %ecx |
1184 | incb %cl |
1185 | jnz L(check_ret_vec_page_cross) |
1186 | |
1187 | # ifdef USE_AS_STRNCMP |
1188 | addl $8, %OFFSET_REG |
1189 | subq %OFFSET_REG64, %rdx |
1190 | jbe L(ret_zero_page_cross_slow_case0) |
1191 | subq $-(VEC_SIZE * 4), %rdx |
1192 | |
1193 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1194 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1195 | # else |
1196 | leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1197 | leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1198 | # endif |
1199 | jmp L(prepare_loop_aligned) |
1200 | |
1201 | |
1202 | .p2align 4,, 10 |
1203 | L(less_8_till_page): |
1204 | # ifdef USE_AS_WCSCMP |
1205 | /* If using wchar then this is the only check before we reach |
1206 | the page boundary. */ |
1207 | movl (%rdi), %eax |
1208 | movl (%rsi), %ecx |
1209 | cmpl %ecx, %eax |
1210 | jnz L(ret_less_8_wcs) |
1211 | # ifdef USE_AS_STRNCMP |
1212 | addq %rdi, %rdx |
1213 | /* We already checked for len <= 1 so cannot hit that case here. |
1214 | */ |
1215 | # endif |
1216 | testl %eax, %eax |
1217 | jnz L(prepare_loop_no_len) |
1218 | ret |
1219 | |
1220 | .p2align 4,, 8 |
1221 | L(ret_less_8_wcs): |
1222 | setl %OFFSET_REG8 |
1223 | negl %OFFSET_REG |
1224 | movl %OFFSET_REG, %eax |
1225 | xorl %r8d, %eax |
1226 | ret |
1227 | |
1228 | # else |
1229 | |
1230 | /* Find largest load size we can use. */ |
1231 | cmpl $28, %eax |
1232 | ja L(less_4_till_page) |
1233 | |
1234 | vmovd (%rdi), %xmm0 |
1235 | vmovd (%rsi), %xmm1 |
1236 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1237 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1238 | vpandn %xmm1, %xmm2, %xmm1 |
1239 | vpmovmskb %ymm1, %ecx |
1240 | subl $0xf, %ecx |
1241 | jnz L(check_ret_vec_page_cross) |
1242 | |
1243 | # ifdef USE_AS_STRNCMP |
1244 | cmpq $4, %rdx |
1245 | jbe L(ret_zero_page_cross_slow_case1) |
1246 | # endif |
1247 | movl $28, %OFFSET_REG |
1248 | /* Explicit check for 16 byte alignment. */ |
1249 | subl %eax, %OFFSET_REG |
1250 | |
1251 | |
1252 | |
1253 | vmovd (%rdi, %OFFSET_REG64), %xmm0 |
1254 | vmovd (%rsi, %OFFSET_REG64), %xmm1 |
1255 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1256 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1257 | vpandn %xmm1, %xmm2, %xmm1 |
1258 | vpmovmskb %ymm1, %ecx |
1259 | subl $0xf, %ecx |
1260 | jnz L(check_ret_vec_page_cross) |
1261 | |
1262 | # ifdef USE_AS_STRNCMP |
1263 | addl $4, %OFFSET_REG |
1264 | subq %OFFSET_REG64, %rdx |
1265 | jbe L(ret_zero_page_cross_slow_case1) |
1266 | subq $-(VEC_SIZE * 4), %rdx |
1267 | |
1268 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1269 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1270 | # else |
1271 | leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1272 | leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1273 | # endif |
1274 | jmp L(prepare_loop_aligned) |
1275 | |
1276 | # ifdef USE_AS_STRNCMP |
1277 | .p2align 4,, 2 |
1278 | L(ret_zero_page_cross_slow_case1): |
1279 | xorl %eax, %eax |
1280 | ret |
1281 | # endif |
1282 | |
1283 | .p2align 4,, 10 |
1284 | L(less_4_till_page): |
1285 | subq %rdi, %rsi |
1286 | /* Extremely slow byte comparison loop. */ |
1287 | L(less_4_loop): |
1288 | movzbl (%rdi), %eax |
1289 | movzbl (%rsi, %rdi), %ecx |
1290 | TOLOWER_gpr (%rax, %eax) |
1291 | TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) |
1292 | subl %BYTE_LOOP_REG, %eax |
1293 | jnz L(ret_less_4_loop) |
1294 | testl %ecx, %ecx |
1295 | jz L(ret_zero_4_loop) |
1296 | # ifdef USE_AS_STRNCMP |
1297 | decq %rdx |
1298 | jz L(ret_zero_4_loop) |
1299 | # endif |
1300 | incq %rdi |
/* The end condition is reaching the page boundary (rdi is aligned). */
1302 | testl $31, %edi |
1303 | jnz L(less_4_loop) |
1304 | leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi |
1305 | addq $-(VEC_SIZE * 4), %rdi |
1306 | # ifdef USE_AS_STRNCMP |
1307 | subq $-(VEC_SIZE * 4), %rdx |
1308 | # endif |
1309 | jmp L(prepare_loop_aligned) |
1310 | |
1311 | L(ret_zero_4_loop): |
1312 | xorl %eax, %eax |
1313 | ret |
1314 | L(ret_less_4_loop): |
1315 | xorl %r8d, %eax |
1316 | subl %r8d, %eax |
1317 | ret |
1318 | # endif |
1319 | cfi_endproc |
1320 | .size STRCMP, .-STRCMP |
1321 | #endif |
1322 | |