/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE and
      loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * VEC_SIZE bytes one
      VEC at a time to check for early mismatches.  Only do this if
      it is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.
   A rough outline of this size dispatch is sketched in the comment
   below.  */
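
/* Rough outline of the size dispatch implemented below (illustrative
   only, not generated code):

     if (size < VEC_SIZE)        goto L(less_vec);
     compare VEC 0;
     if (size <= 2 * VEC_SIZE)   goto L(last_1x_vec);
     compare VEC 1;
     if (size <= 4 * VEC_SIZE)   goto L(last_2x_vec);
     compare VEC 2 and VEC 3;
     if (size > 8 * VEC_SIZE)    goto L(more_8x_vec);  (aligned 4x loop)
     compare the last 4 VEC with overlapping unaligned loads.  */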


# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define CHAR_SIZE 4
#  define VPCMPEQ vpcmpeqd
# else
#  define CHAR_SIZE 1
#  define VPCMPEQ vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/
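
/* For example, with a first byte of 0x80 in s1 and 0x7f in s2, memcmp
   must return a positive value because bytes compare as unsigned
   char, while wmemcmp comparing elements 0x80000000 and 0x7fffffff
   must return a negative value because wchar_t is a signed 32-bit
   integer on this target.  Hence the wmemcmp paths below use setg
   (signed greater) while the memcmp paths use a plain byte
   subtraction.  */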

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
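	/* Scale the wchar_t count in RDX to a byte count
	   (CHAR_SIZE is 4).  */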
	shl $2, %RDX_LP
# elif defined __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
	cmp $VEC_SIZE, %RDX_LP
	jb L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu (%rsi), %ymm1
	VPCMPEQ (%rdi), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
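	/* vpmovmskb sets one mask bit per byte that compared equal; if
	   the whole VEC matched, the mask is 0xffffffff and incl wraps
	   it to zero, setting the zero flag.  */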
	incl %eax
	jnz L(return_vec_0)

	cmpq $(VEC_SIZE * 2), %rdx
	jbe L(last_1x_vec)

	/* Check second VEC no matter what.  */
	vmovdqu VEC_SIZE(%rsi), %ymm2
	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	/* If s1 and s2 were equal in this VEC, eax will be all 1s, so
	   incl will overflow and set the zero flag.  */
	incl %eax
	jnz L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq $(VEC_SIZE * 4), %rdx
	jbe L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl %eax
	jnz L(return_vec_2)
	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpmovmskb %ymm4, %ecx
	incl %ecx
	jnz L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq $(VEC_SIZE * 8), %rdx
	ja L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
	vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
	leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi

	/* Wait to load from s1 until the addresses have been adjusted,
	   to avoid unlamination of micro-fusion with complex address
	   modes.  */
	VPCMPEQ (%rdi), %ymm1, %ymm1
	VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2

	vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	/* Reduce VEC0 - VEC4.  */
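	/* ymm7 is all ones only if every byte of all four compares
	   matched, so a single vpmovmskb + incl below tests
	   4 * VEC_SIZE bytes at once.  */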
	vpand %ymm1, %ymm2, %ymm5
	vpand %ymm3, %ymm4, %ymm6
	vpand %ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	incl %ecx
	jnz L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_0):
	tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
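	/* Branchless result: dl becomes 1 if the s1 element is
	   (signed) greater than the s2 element, else 0, and
	   2 * dl - 1 then yields 1 or -1 (a mismatch guarantees the
	   elements differ).  */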
	movl (%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (%rsi, %rax), %ecx
	movzbl (%rdi, %rax), %eax
	subl %ecx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(return_vec_1):
	tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl VEC_SIZE(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl VEC_SIZE(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl VEC_SIZE(%rsi, %rax), %ecx
	movzbl VEC_SIZE(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_2):
	tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	/* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
	.p2align 5
L(8x_return_vec_0_1_2_3):
	/* Returning from L(more_8x_vec) requires restoring rsi.  */
	addq %rdi, %rsi
L(return_vec_0_1_2_3):
	vpmovmskb %ymm1, %eax
	incl %eax
	jnz L(return_vec_0)

	vpmovmskb %ymm2, %eax
	incl %eax
	jnz L(return_vec_1)

	vpmovmskb %ymm3, %eax
	incl %eax
	jnz L(return_vec_2)
L(return_vec_3):
	tzcntl %ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	xorl %edx, %edx
	cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to update only one
	   pointer.  */
	subq %rdi, %rsi
	/* Align s1 pointer.  */
	andq $-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq $-(VEC_SIZE * 4), %rdi
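	/* NB: the subq of a negative immediate above adds
	   VEC_SIZE * 4; -128 fits in a sign-extended 8-bit immediate
	   while +128 does not.  */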
	.p2align 4
L(loop_4x_vec):
	/* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
	 */
	vmovdqu (%rsi, %rdi), %ymm1
	VPCMPEQ (%rdi), %ymm1, %ymm1

	vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2
	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2

	vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

	vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	vpand %ymm1, %ymm2, %ymm5
	vpand %ymm3, %ymm4, %ymm6
	vpand %ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	incl %ecx
	jnz L(8x_return_vec_0_1_2_3)
	subq $-(VEC_SIZE * 4), %rdi
	/* Check if s1 pointer at end.  */
	cmpq %rdx, %rdi
	jb L(loop_4x_vec)

	subq %rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl $(VEC_SIZE * 3), %edi
	jae L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3
	cmpl $(VEC_SIZE * 2), %edi
	jae L(8x_last_2x_vec)

	/* Check last 4 VEC.  */
	vmovdqu (%rsi, %rdx), %ymm1
	VPCMPEQ (%rdx), %ymm1, %ymm1

	vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2
	VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2

	VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3

	vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4

	vpand %ymm1, %ymm2, %ymm5
	vpand %ymm3, %ymm4, %ymm6
	vpand %ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	/* Restore s1 pointer to rdi.  */
	movq %rdx, %rdi
	incl %ecx
	jnz L(8x_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	VZEROUPPER_RETURN

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4
L(8x_last_2x_vec):
	/* Check second to last VEC.  rdx stores the end pointer of s1
	   and ymm3 has already been loaded with the second to last VEC
	   from s2.  */
	VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl %eax
	jnz L(8x_return_vec_2)
	/* Check last VEC.  */
	.p2align 4
L(8x_last_1x_vec):
	vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl %eax
	jnz L(8x_return_vec_3)
	VZEROUPPER_RETURN

	.p2align 4
L(last_2x_vec):
	/* Check second to last VEC.  */
	vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
	VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl %eax
	jnz L(return_vec_1_end)
	/* Check last VEC.  */
L(last_1x_vec):
	vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
	VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl %eax
	jnz L(return_vec_0_end)
	VZEROUPPER_RETURN

	.p2align 4
L(8x_return_vec_2):
	subq $VEC_SIZE, %rdx
L(8x_return_vec_3):
	tzcntl %eax, %eax
	addq %rdx, %rax
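	/* rsi still holds s2 - s1 from L(more_8x_vec), so
	   (%rsi, %rax) below addresses the corresponding element of
	   s2.  */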
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 3)(%rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 3)(%rax), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_1_end):
	tzcntl %eax, %eax
	addl %edx, %eax
# ifdef USE_AS_WMEMCMP
	movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_0_end):
	tzcntl %eax, %eax
	addl %edx, %eax
# ifdef USE_AS_WMEMCMP
	movl -VEC_SIZE(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl -VEC_SIZE(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl -VEC_SIZE(%rsi, %rax), %ecx
	movzbl -VEC_SIZE(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(less_vec):
	/* Check if one or less CHAR.  This is necessary for size = 0
	   but is also faster for size = CHAR_SIZE.  */
	cmpl $CHAR_SIZE, %edx
	jbe L(one_or_less)

	/* Check if loading one VEC from either s1 or s2 could cause a
	   page cross.  This can have false positives but is by far the
	   fastest method.  */
	movl %edi, %eax
	orl %esi, %eax
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jg L(page_cross_less_vec)
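	/* NB: the masked OR above is at least as large as either page
	   offset, so a real page cross is never missed; a false
	   positive only costs taking the slower path.  */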

	/* No page cross possible.  */
	vmovdqu (%rsi), %ymm2
	VPCMPEQ (%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	incl %eax
	/* Result will be zero if s1 and s2 match.  Otherwise first set
	   bit will be first mismatch.  */
	bzhil %edx, %eax, %edx
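	/* bzhi clears the mask bits at positions >= size, so a
	   difference beyond the requested length cannot make the
	   result nonzero; the flags from bzhi feed the jnz below.  */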
	jnz L(return_vec_0)
	xorl %eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(page_cross_less_vec):
	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
	   bytes.  */
	cmpl $16, %edx
	jae L(between_16_31)
# ifndef USE_AS_WMEMCMP
	cmpl $8, %edx
	jae L(between_8_15)
	/* Fall through for [4, 7].  */
	cmpl $4, %edx
	jb L(between_2_3)

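	/* Sizes 4 to 7: build a 64-bit big-endian key from the first
	   and last 4 bytes of each buffer (the two loads overlap), so
	   one unsigned subtraction plus the carry flag orders the
	   buffers without branches.  */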
	movbe (%rdi), %eax
	movbe (%rsi), %ecx
	shlq $32, %rax
	shlq $32, %rcx
	movbe -4(%rdi, %rdx), %edi
	movbe -4(%rsi, %rdx), %esi
	orq %rdi, %rax
	orq %rsi, %rcx
	subq %rcx, %rax
	/* Fast path for return zero.  */
	jnz L(ret_nonzero)
	/* No ymm register was touched.  */
	ret

	.p2align 4
L(one_or_less):
	jb L(zero)
	movzbl (%rsi), %ecx
	movzbl (%rdi), %eax
	subl %ecx, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4,, 5
L(ret_nonzero):
	sbbl %eax, %eax
	orl $1, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4,, 2
L(zero):
	xorl %eax, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4
L(between_8_15):
	movbe (%rdi), %rax
	movbe (%rsi), %rcx
	subq %rcx, %rax
	jnz L(ret_nonzero)
	movbe -8(%rdi, %rdx), %rax
	movbe -8(%rsi, %rdx), %rcx
	subq %rcx, %rax
	/* Fast path for return zero.  */
	jnz L(ret_nonzero)
	/* No ymm register was touched.  */
	ret
# else
	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
	vmovq (%rdi), %xmm1
	vmovq (%rsi), %xmm2
	VPCMPEQ %xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl $0xffff, %eax
	jnz L(return_vec_0)
	/* Use overlapping loads to avoid branches.  */
	leaq -8(%rdi, %rdx), %rdi
	leaq -8(%rsi, %rdx), %rsi
	vmovq (%rdi), %xmm1
	vmovq (%rsi), %xmm2
	VPCMPEQ %xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl $0xffff, %eax
	/* Fast path for return zero.  */
	jnz L(return_vec_0)
	/* No ymm register was touched.  */
	ret
# endif

	.p2align 4,, 10
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */
	vmovdqu (%rsi), %xmm2
	VPCMPEQ (%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
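	/* With 16-byte vectors vpmovmskb sets only the low 16 mask
	   bits, so subtracting 0xffff (rather than using incl) gives
	   zero exactly when every byte matched.  */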
	subl $0xffff, %eax
	jnz L(return_vec_0)

	/* Use overlapping loads to avoid branches.  */

	vmovdqu -16(%rsi, %rdx), %xmm2
	leaq -16(%rdi, %rdx), %rdi
	leaq -16(%rsi, %rdx), %rsi
	VPCMPEQ (%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl $0xffff, %eax
	/* Fast path for return zero.  */
	jnz L(return_vec_0)
	/* No ymm register was touched.  */
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4,, 2
L(zero):
	xorl %eax, %eax
	ret

	.p2align 4
L(one_or_less):
	jb L(zero)
	movl (%rdi), %ecx
	xorl %edx, %edx
	cmpl (%rsi), %ecx
	je L(zero)
	setg %dl
	leal -1(%rdx, %rdx), %eax
	/* No ymm register was touched.  */
	ret
# else

	.p2align 4
L(between_2_3):
	/* Load as big endian to avoid branches.  */
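	/* After bswap the two leading bytes sit in the upper half as a
	   big-endian value; shifting right by one keeps bit 31 clear,
	   so the final subl cannot overflow and its sign matches the
	   unsigned ordering.  */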
	movzwl (%rdi), %eax
	movzwl (%rsi), %ecx
	bswap %eax
	bswap %ecx
	shrl %eax
	shrl %ecx
	movzbl -1(%rdi, %rdx), %edi
	movzbl -1(%rsi, %rdx), %esi
	orl %edi, %eax
	orl %esi, %ecx
	/* Subtraction is okay because the upper bit is zero.  */
	subl %ecx, %eax
	/* No ymm register was touched.  */
	ret
# endif

END (MEMCMP)
#endif