/* memcmp with SSE2
   Copyright (C) 2009-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

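/* wmemcmp compares 4-byte wchar_t elements, so it uses dword compares
   (pcmpeqd) and a CHAR_SIZE of 4; plain memcmp and memcmpeq compare
   bytes with pcmpeqb.  */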
#ifdef USE_AS_WMEMCMP
# define PCMPEQ pcmpeqd
# define CHAR_SIZE 4
# define SIZE_OFFSET (0)
#else
# define PCMPEQ pcmpeqb
# define CHAR_SIZE 1
#endif

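/* memcmpeq only needs a zero/non-zero result, so CHECK_CMP uses `subl`,
   which both tests for a mismatch and leaves a non-zero return value
   directly in the register.  memcmp/wmemcmp use `cmpl` and compute the
   ordered return on the mismatch path.  When SIZE_OFFSET is non-zero,
   %rdx is biased by 2 * CHAR_PER_VEC in L(more_1x_vec) and the
   displacements below compensate for it.  */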
#ifdef USE_AS_MEMCMPEQ
# define SIZE_OFFSET (0)
# define CHECK_CMP(x, y) subl x, y
#else
# ifndef SIZE_OFFSET
#  define SIZE_OFFSET (CHAR_PER_VEC * 2)
# endif
# define CHECK_CMP(x, y) cmpl x, y
#endif

#define VEC_SIZE 16
#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

#ifndef MEMCMP
# define MEMCMP memcmp
#endif

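/* Overview: small lengths are handled with scalar loads; lengths up to
   8 vectors use unrolled, unaligned 16-byte SSE2 compares, with
   potentially overlapping loads from the end of the buffers covering
   the remainder; larger lengths align %rdi and loop over 4 vectors
   (64 bytes) per iteration, folding the compare results with pand so a
   single pmovmskb test covers all four.  */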
	.text
ENTRY(MEMCMP)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
#ifdef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on pmovmskb bitmask.  Store
	   in ecx for code size.  This is preferable to using `incw` as
	   it avoids partial register stalls on older hardware (pre
	   SnB).  */
	movl $0xffff, %ecx
#endif
	cmpq $CHAR_PER_VEC, %rdx
	ja L(more_1x_vec)

#ifdef USE_AS_WMEMCMP
	/* Saves a byte of code by keeping the fall-through path for
	   n = [2, 4] in the initial cache line.  */
	decl %edx
	jle L(cmp_0_1)

	movq (%rsi), %xmm0
	movq (%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_start_0)

	movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
	movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_end_0_adj)
#else
	cmpl $8, %edx
	ja L(cmp_9_16)

	cmpl $4, %edx
	jb L(cmp_0_3)

# ifdef USE_AS_MEMCMPEQ
	movl (%rsi), %eax
	subl (%rdi), %eax

	movl -4(%rsi, %rdx), %esi
	subl -4(%rdi, %rdx), %esi

	orl %esi, %eax
	ret
# else
	/* Combine comparisons for lo and hi 4-byte comparisons.  */
	movl -4(%rsi, %rdx), %ecx
	movl -4(%rdi, %rdx), %eax
	shlq $32, %rcx
	shlq $32, %rax
	movl (%rsi), %esi
	movl (%rdi), %edi
	orq %rsi, %rcx
	orq %rdi, %rax
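	/* %rax/%rcx now hold bytes [0, 3] in the low dword and bytes
	   [n - 4, n - 1] in the high dword; when the two loads overlap,
	   the shared bytes are identical in both registers, so equality
	   is still decided correctly.  */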
	/* Only compute proper return if not-equal.  */
	cmpq %rcx, %rax
	jnz L(ret_nonzero)
	xorl %eax, %eax
	ret
# endif

	.p2align 4,, 10
L(cmp_9_16):
# ifdef USE_AS_MEMCMPEQ
	movq (%rsi), %rax
	subq (%rdi), %rax

	movq -8(%rsi, %rdx), %rcx
	subq -8(%rdi, %rdx), %rcx
	orq %rcx, %rax
	/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
	   return long).  */
	setnz %cl
	movzbl %cl, %eax
# else
	movq (%rsi), %rcx
	movq (%rdi), %rax
	/* Only compute proper return if not-equal.  */
	cmpq %rcx, %rax
	jnz L(ret_nonzero)

	movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
	movq -8(%rdi, %rdx, CHAR_SIZE), %rax
	/* Only compute proper return if not-equal.  */
	cmpq %rcx, %rax
	jnz L(ret_nonzero)
	xorl %eax, %eax
# endif
#endif
	ret

	.p2align 4,, 8
L(cmp_0_1):
	/* Flag set by earlier comparison against 1.  */
	jne L(cmp_0_0)
#ifdef USE_AS_WMEMCMP
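	/* Branchless -1/0/1 return: the equal case returns via
	   L(cmp_0_0); otherwise setg yields 1 when (%rdi) > (%rsi) and
	   the lea maps {0, 1} to {-1, 1}.  */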
	movl (%rdi), %ecx
	xorl %edx, %edx
	cmpl (%rsi), %ecx
	je L(cmp_0_0)
	setg %dl
	leal -1(%rdx, %rdx), %eax
#else
	movzbl (%rdi), %eax
	movzbl (%rsi), %ecx
	subl %ecx, %eax
#endif
	ret

	/* Fits in aligning bytes.  */
L(cmp_0_0):
	xorl %eax, %eax
	ret

#ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_start_0):
	bsfl %eax, %eax
	movl (%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
	ret
#else

# ifndef USE_AS_MEMCMPEQ
	.p2align 4,, 14
L(ret_nonzero):
	/* Need to bswap to get proper return without branch.  */
	bswapq %rcx
	bswapq %rax
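	/* After bswap the first differing byte is the most significant
	   differing byte, so an unsigned compare gives memcmp ordering:
	   sbbl materializes the borrow as 0/-1 and orl $1 turns that
	   into 1/-1.  */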
	subq %rcx, %rax
	sbbl %eax, %eax
	orl $1, %eax
	ret
# endif

	.p2align 4
L(cmp_0_3):
# ifdef USE_AS_MEMCMPEQ
	/* No reason to add to the dependency chain on rdx.  Saving the
	   bytes here doesn't change the number of fetch blocks.  */
	cmpl $1, %edx
	jbe L(cmp_0_1)
# else
	/* We need the code size to prevent taking an extra fetch block.  */
	decl %edx
	jle L(cmp_0_1)
# endif
	movzwl (%rsi), %ecx
	movzwl (%rdi), %eax

# ifdef USE_AS_MEMCMPEQ
	subl %ecx, %eax

	movzbl -1(%rsi, %rdx), %esi
	movzbl -1(%rdi, %rdx), %edi
	subl %edi, %esi
	orl %esi, %eax
# else
	bswapl %ecx
	bswapl %eax

	/* Implicit right shift by one.  We just need to displace the
	   sign bits.  */
	shrl %ecx
	shrl %eax

	/* Eat a partial register stall here.  Saves code stopping
	   L(cmp_0_3) from bleeding into the next fetch block and saves
	   an ALU.  */
	movb (%rsi, %rdx), %cl
	movzbl (%rdi, %rdx), %edi
	orl %edi, %eax
	subl %ecx, %eax
# endif
	ret
#endif

	.p2align 5
L(more_1x_vec):
#ifndef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on pmovmskb bitmask.  Store
	   in ecx for code size.  This is preferable to using `incw` as
	   it avoids partial register stalls on older hardware (pre
	   SnB).  */
	movl $0xffff, %ecx
#endif
	movups (%rsi), %xmm0
	movups (%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_start_0)
#if SIZE_OFFSET == 0
	cmpq $(CHAR_PER_VEC * 2), %rdx
#else
	/* Offset rdx.  Saves just enough code size to keep the
	   L(last_2x_vec) case and the non-zero return in a single
	   cache line.  */
	subq $(CHAR_PER_VEC * 2), %rdx
#endif
	ja L(more_2x_vec)

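	/* n <= 2 * CHAR_PER_VEC: finish with the last vector, which may
	   overlap the first.  */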
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
#ifndef USE_AS_MEMCMPEQ
	/* Don't use `incw ax` as machines this code runs on are liable
	   to have partial register stalls.  */
	jnz L(ret_nonzero_vec_end_0)
#else
	/* Various return targets for memcmpeq.  Will always be hot in
	   Icache and get short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
#endif
	ret

#ifndef USE_AS_MEMCMPEQ
# ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_end_0_adj):
	addl $3, %edx
# else
	.p2align 4,, 8
# endif
L(ret_nonzero_vec_end_0):
	bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
	leal (%rax, %rdx, CHAR_SIZE), %eax
	movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	/* Use `addq` instead of `addl` here so that even if `rax` + `rdx`
	   is negative, the sum is still usable as a 64-bit offset
	   (negative 32-bit numbers zero-extend to large and often
	   out-of-bounds 64-bit offsets).  Note that `rax` + `rdx` >= 0 is
	   an invariant when `memcmp` is used correctly, but if the input
	   strings `rsi`/`rdi` are concurrently modified as the function
	   runs (there is a data race) it is possible for `rax` + `rdx` to
	   be negative.  Given that there is virtually no extra cost to
	   using `addq` instead of `addl`, we may as well protect the
	   data-race case.  */
	addq %rdx, %rax
	movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret
# ifndef USE_AS_WMEMCMP
	.p2align 4,, 10
L(ret_nonzero_vec_start_0):
	bsfl %eax, %eax
	movzbl (%rsi, %rax), %ecx
	movzbl (%rdi, %rax), %eax
	subl %ecx, %eax
	ret
# endif
#else
#endif

	.p2align 5
L(more_2x_vec):
	movups (VEC_SIZE * 1)(%rsi), %xmm0
	movups (VEC_SIZE * 1)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_start_1)

	cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
	jbe L(last_2x_vec)

	cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
	ja L(more_8x_vec)

	/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
	   This can harm performance if non-zero return in [65, 80] or
	   [97, 112] but helps performance otherwise.  Generally zero-
	   return is hotter.  */
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * 3)(%rsi), %xmm2
	movups (VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jnz L(ret_nonzero_vec_start_2_3)

	cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
	jbe L(last_2x_vec)

	movups (VEC_SIZE * 4)(%rsi), %xmm0
	movups (VEC_SIZE * 4)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * 5)(%rsi), %xmm2
	movups (VEC_SIZE * 5)(%rdi), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
#ifdef USE_AS_MEMCMPEQ
	jz L(last_2x_vec)
	ret
#else
	jnz L(ret_nonzero_vec_start_4_5)
#endif
	.p2align 4
L(last_2x_vec):
	movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3
	pmovmskb %xmm3, %eax
	subl %ecx, %eax
#ifdef USE_AS_MEMCMPEQ
	/* Various return targets for memcmpeq.  Will always be hot in
	   Icache and get short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
	ret
#else
	jnz L(ret_nonzero_vec_end_1)
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_end_1):
	pmovmskb %xmm1, %ecx
	/* High 16 bits of eax guaranteed to be all ones.  Rotate them in
	   so we can do `or + not` with just `xor`.  */
	rorl $16, %eax
	xorl %ecx, %eax
	/* Partial register stall.  */
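	/* eax now holds the inverted match bits of the second-to-last
	   vector in bits [0, 15] and (combined mask + 1) in bits
	   [16, 31], so bsf finds the byte offset of the first mismatch
	   relative to the second-to-last trailing vector.  */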

	bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
	leal (%rax, %rdx, CHAR_SIZE), %eax
	movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	addl %edx, %eax
	movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4
L(ret_nonzero_vec_start_4_5):
	pmovmskb %xmm1, %edx
	sall $16, %eax
	leal 1(%rax, %rdx), %eax
	bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_start_1):
	bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret
#endif

	.p2align 4
L(more_8x_vec):
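	/* Make %rsi hold its displacement from %rdi, align %rdi down to
	   VEC_SIZE (re-adding the displacement keeps %rsi at the same
	   relative offset), and turn %rdx into a loop-end pointer so the
	   loop bound is a plain pointer compare.  */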
	subq %rdi, %rsi
	leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
	andq $(VEC_SIZE * -1), %rdi
	addq %rdi, %rsi
	.p2align 4
L(loop_4x):
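	/* Compare 4 vectors per iteration, folding the results with
	   pand so a single pmovmskb test detects any mismatch.  */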
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 3)(%rsi), %xmm1

	PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
	PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1

	movups (VEC_SIZE * 4)(%rsi), %xmm2
	movups (VEC_SIZE * 5)(%rsi), %xmm3

	PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
	PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3

	pand %xmm0, %xmm1
	pand %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_loop)

	addq $(VEC_SIZE * 4), %rdi
	addq $(VEC_SIZE * 4), %rsi
	cmpq %rdi, %rdx
	ja L(loop_4x)
	/* Get remaining length in edx.  */
	subl %edi, %edx
	/* Restore offset so we can reuse L(last_2x_vec).  */
	addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
#ifdef USE_AS_WMEMCMP
	shrl $2, %edx
#endif
	cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
	jbe L(last_2x_vec)


	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * 3)(%rsi), %xmm2
	movups (VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jz L(last_2x_vec)
#ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
	ret
#else

	.p2align 4
L(ret_nonzero_vec_start_2_3):
	pmovmskb %xmm1, %edx
	sall $16, %eax
	leal 1(%rax, %rdx), %eax

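	/* eax = (combined mask << 16) + vec 2 match mask + 1; its lowest
	   set bit gives the first mismatching byte relative to
	   (VEC_SIZE * 2).  */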
	bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4
L(ret_nonzero_loop):
	pmovmskb %xmm0, %ecx
	pmovmskb %xmm1, %edx
	sall $(VEC_SIZE * 1), %edx
	leal 1(%rcx, %rdx), %edx
	pmovmskb %xmm2, %ecx
	/* High 16 bits of eax guaranteed to be all ones.  Rotate them in
	   so we can do `or + not` with just `xor`.  */
	rorl $16, %eax
	xorl %ecx, %eax

	salq $32, %rax
	orq %rdx, %rax
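	/* rax now combines the match information of all four vectors
	   (vectors 2/3 in the low half, 4/5 in the high half), arranged
	   so that bsfq gives the offset of the first mismatching byte
	   from (VEC_SIZE * 2).  */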

	bsfq %rax, %rax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because xorl zero idiom
	   above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret
#endif
END(MEMCMP)

#ifndef USE_AS_WMEMCMP
# ifdef USE_AS_MEMCMPEQ
libc_hidden_def (MEMCMP)
# else
#  undef bcmp
weak_alias (MEMCMP, bcmp)
libc_hidden_builtin_def (MEMCMP)
# endif
#endif

