/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC chars
      one vector at a time to check for early mismatches.  Only do
      this if it's guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.

When possible the implementation tries to optimize for the frontend in
the following ways:
Throughput:
    1. All code sections that fit are able to run optimally out of the
       LSD.
    2. All code sections that fit are able to run optimally out of the
       DSB.
    3. Basic blocks are contained in the minimum number of fetch
       blocks necessary.

Latency:
    1. Logically connected basic blocks are put in the same
       cache-line.
    2. Logically connected basic blocks that do not fit in the same
       cache-line are put in adjacent lines.  This can benefit from L2
       spatial prefetching and L1 next-line prefetching.  */
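
/* As a rough, illustrative sketch, the size dispatch implemented below
   corresponds to:

       if (len <= CHAR_PER_VEC)           masked single-VEC compare
       else if (len <= 2 * CHAR_PER_VEC)  VEC 0, then last VEC
       else if (len <= 4 * CHAR_PER_VEC)  VEC 0-1, then last 2 VECs
       else if (len <= 8 * CHAR_PER_VEC)  VEC 0-3, then last 4 VECs (branchless)
       else                               aligned 4-VEC loop, then tail

   where "last N VECs" are addressed relative to the end of the buffers
   and may overlap the already-checked leading VECs.  */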

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_evex_movbe
# endif

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifdef USE_AS_WMEMCMP
#  define VMOVU_MASK vmovdqu32
#  define CHAR_SIZE 4
#  define VPCMP vpcmpd
#  define VPCMPEQ vpcmpeqd
#  define VPTEST vptestmd

#  define USE_WIDE_CHAR
# else
#  define VMOVU_MASK vmovdqu8
#  define CHAR_SIZE 1
#  define VPCMP vpcmpub
#  define VPCMPEQ vpcmpeqb
#  define VPTEST vptestmb
# endif

# include "reg-macros.h"

# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)


/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/
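
/* For example, comparing the byte 0x80 with the byte 0x01: memcmp
   treats them as unsigned chars (0x80 = 128 > 1, positive result),
   while wmemcmp comparing the wide chars 0x80000000 and 0x00000001
   treats them as signed ints (negative < positive, negative result).
   This is why the wide-char paths below use a signed cmpl/setg
   sequence and the byte paths use movzbl/subl.  */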

	.section SECTION(.text), "ax", @progbits
/* Cache align memcmp entry.  This allows for much more thorough
   frontend optimization.  */
ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
	cmp $CHAR_PER_VEC, %RDX_LP
	/* Fall through for [0, CHAR_PER_VEC] as it's the hottest.  */
	ja L(more_1x_vec)

	/* Create mask of bytes that are guaranteed to be valid because
	   of length (edx).  Using masked movs allows us to skip checks
	   for page crosses/zero size.  */
	mov $-1, %VRAX
	bzhi %VRDX, %VRAX, %VRAX
	/* NB: A `jz` might be useful here.  Page-faults that are
	   invalidated by predicated execution (the evex mask) can be
	   very slow.  The expectation is that this is not the norm, as
	   "most" code will not regularly call 'memcmp' with length = 0
	   and memory that is not wired up.  */
	KMOV %VRAX, %k2
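	/* Illustration: for length = 5 (chars), bzhi leaves only the low
	   5 bits of VRAX set, so k2 selects exactly the first 5 chars
	   and every later lane is ignored by both the load and the
	   compare below.  */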



	/* Safe to load full ymm with mask.  */
	VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
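	/* The {z} zero-masking clears the lanes not selected by k2
	   instead of merging with VMM(2)'s old contents, so the load
	   carries no dependency on the register's previous value.  The
	   compare below is masked by k2 as well, so the zeroed lanes
	   never report a spurious mismatch.  */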
	/* Slightly different method for VEC_SIZE == 64 to save a bit of
	   code size.  This allows us to fit L(return_vec_0) entirely in
	   the first cache line.  */
# if VEC_SIZE == 64
	VPCMPEQ (%rdi), %VMM(2), %k1{%k2}
	KMOV %k1, %VRCX
	sub %VRCX, %VRAX
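	/* Since the compare is masked by k2, the equal-lane mask in VRCX
	   can only have bits set where the valid-lane mask in VRAX does.
	   The subtraction therefore never borrows and is equivalent to
	   VRAX & ~VRCX, i.e. the mask of mismatched valid chars that
	   L(return_vec_0) expects in VRAX.  */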
# else
	VPCMP $4, (%rdi), %VMM(2), %k1{%k2}
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
# endif
	jnz L(return_vec_0)
	ret

	.p2align 4,, 11
L(return_vec_0):
	bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	movl (%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl (%rsi, %rax, CHAR_SIZE), %ecx
	/* NB: no partial register stall here because of the xorl
	   zero-idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
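	/* This is the -1/1 return construction used throughout this
	   file: edx was zeroed, setg sets dl to 1 iff the s1 char
	   compares greater (signed), and leal computes 2 * rdx - 1,
	   i.e. 1 on s1 > s2 and -1 on s1 < s2.  Equality is impossible
	   here since this path is only reached on a mismatch.  */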
# else
	movzbl (%rsi, %rax), %ecx
#  if VEC_SIZE == 64
	movb (%rdi, %rax), %al
#  else
	movzbl (%rdi, %rax), %eax
#  endif
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 11
L(more_1x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU (%rsi), %VMM(1)
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP $4, (%rdi), %VMM(1), %k1
	KMOV %k1, %VRAX
	/* NB: eax must be destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) destination
	   register must be ecx.  */
	test %VRAX, %VRAX
	jnz L(return_vec_0)

	cmpq $(CHAR_PER_VEC * 2), %rdx
	jbe L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU VEC_SIZE(%rsi), %VMM(2)
	VPCMP $4, VEC_SIZE(%rdi), %VMM(2), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq $(CHAR_PER_VEC * 4), %rdx
	jbe L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
	VPCMP $4, (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_2)

	VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
	VPCMP $4, (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
	KMOV %k1, %VRCX
	test %VRCX, %VRCX
	jnz L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq $(CHAR_PER_VEC * 8), %rdx
	ja L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
	VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(2)
	leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
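	/* rdi and rsi now point 4 * VEC_SIZE bytes before the end of s1
	   and s2, so the four vector loads at offsets 0..3 * VEC_SIZE
	   cover exactly the last 4 VECs of the buffers.  They may
	   overlap the first 4 VECs checked above, which is harmless
	   since those already compared equal.  */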

	/* Wait to load from s1 until the addresses have been adjusted,
	   due to unlamination of micro-fusion with complex address
	   modes.  */

	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq (%rdi), %VMM(1), %VMM(1)
	vpxorq (VEC_SIZE)(%rdi), %VMM(2), %VMM(2)

	VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
	vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)

	VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with VEC(4) while
	   oring with VEC(1).  Result is stored in VEC(4).  */
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)

	/* Or together VEC(2), VEC(3), and VEC(4) into VEC(4).  */
	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
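	/* The vpternlogd imm8 is a 3-input truth table indexed, in Intel
	   operand order, by the destination bit (high), the middle
	   source bit, and the last (memory) source bit (low).  0xde
	   evaluates (dst ^ last_src) | middle_src, i.e. the fourth VEC's
	   XOR-difference or'd with VEC(1)'s difference, and 0xfe is the
	   plain 3-way OR, so VEC(4) accumulates the differences of all
	   four VECs.  */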

	/* Test VEC(4) against itself.  Store any CHAR mismatches in k1.
	 */
	VPTEST %VMM(4), %VMM(4), %k1
	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
	KMOV %k1, %VRCX
	test %VRCX, %VRCX
	jnz L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret


	.p2align 4,, 9
L(8x_end_return_vec_0_1_2_3):
	movq %rdx, %rdi
L(8x_return_vec_0_1_2_3):
	/* L(loop_4x_vec) leaves result in `k1` for VEC_SIZE == 64.  */
# if VEC_SIZE == 64
	KMOV %k1, %VRCX
# endif
	addq %rdi, %rsi
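	/* On the 8x path rsi still holds s2 - s1 (set up in
	   L(more_8x_vec)), so adding the s1 base used for the
	   mismatching loads (rdi, or rdx for the end-of-buffer case)
	   turns it back into a usable s2 pointer for the return code
	   below.  */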
L(return_vec_0_1_2_3):
	VPTEST %VMM(1), %VMM(1), %k0
	KMOV %k0, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_0)

	VPTEST %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_1)

	VPTEST %VMM(3), %VMM(3), %k0
	KMOV %k0, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_2)
	.p2align 4,, 2
L(return_vec_3):
	/* bsf saves 1 byte from tzcnt.  This keeps L(return_vec_3) in
	   one fetch block and the entire L(*return_vec_0_1_2_3) in 1
	   cache line.  */
	bsf %VRCX, %VRCX
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
	xorl %edx, %edx
	cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl %ecx, %eax
# endif
	ret


	.p2align 4,, 8
L(return_vec_1):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
	   fetch block.  */
	bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl VEC_SIZE(%rsi, %rax), %ecx
	movzbl VEC_SIZE(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 7
L(return_vec_2):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
	   fetch block.  */
	bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 8
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */
	subq %rdi, %rsi
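	/* With rsi = s2 - s1, the loop can reach the s2 data matching
	   any s1 address held in rdi via the (%rsi, %rdi) addressing
	   mode, so aligning and advancing rdi automatically keeps the
	   two streams in sync.  */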
	/* Align s1 pointer.  */
	andq $-VEC_SIZE, %rdi
	/* Adjust because the first 4x vec were already checked.  */
	subq $-(VEC_SIZE * 4), %rdi
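	/* NB: subtracting the negative constant instead of using addq is
	   a code-size trick: for VEC_SIZE == 32, -128 fits in a
	   sign-extended 8-bit immediate while +128 does not.  */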

	.p2align 4
L(loop_4x_vec):
	VMOVU (%rsi, %rdi), %VMM(1)
	vpxorq (%rdi), %VMM(1), %VMM(1)
	VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2)
	vpxorq VEC_SIZE(%rdi), %VMM(2), %VMM(2)
	VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
	vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
	VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)
	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
	VPTEST %VMM(4), %VMM(4), %k1
	/* If VEC_SIZE == 64 just branch with KTEST.  We have free port0
	   space and it allows the loop to fit in 2x cache lines
	   instead of 3.  */
# if VEC_SIZE == 64
	KTEST %k1, %k1
# else
	KMOV %k1, %VRCX
	test %VRCX, %VRCX
# endif
	jnz L(8x_return_vec_0_1_2_3)
	subq $-(VEC_SIZE * 4), %rdi
	cmpq %rdx, %rdi
	jb L(loop_4x_vec)
	subq %rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
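	/* I.e. edi is in [0, 4 * VEC_SIZE): edi >= 3 * VEC_SIZE means at
	   most one VEC of unchecked data remains, edi >= 2 * VEC_SIZE
	   means at most two.  The tail below is addressed relative to
	   rdx (end of s1 minus 4 * VEC_SIZE), so it may re-compare data
	   the loop already covered, which is harmless.  */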
	cmpl $(VEC_SIZE * 3), %edi
	jge L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)

	/* Separate logic as we can only use testb for VEC_SIZE == 64.  */
# if VEC_SIZE == 64
	testb %dil, %dil
	js L(8x_last_2x_vec)
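	/* edi < 4 * VEC_SIZE == 256 here, so its low byte equals edi and
	   the byte's sign bit is set exactly when edi >= 128 ==
	   2 * VEC_SIZE.  testb/js is therefore equivalent to the
	   cmpl/jge in the other branch, just smaller.  */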
# else
	cmpl $(VEC_SIZE * 2), %edi
	jge L(8x_last_2x_vec)
# endif

	vpxorq (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(3)

	VMOVU (%rsi, %rdx), %VMM(1)
	vpxorq (%rdx), %VMM(1), %VMM(1)

	VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2)
	vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2)
	VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %VMM(1), %VMM(4)
	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
	VPTEST %VMM(4), %VMM(4), %k1
	/* L(8x_end_return_vec_0_1_2_3) expects the bitmask to still be
	   in `k1` if VEC_SIZE == 64.  */
# if VEC_SIZE == 64
	KTEST %k1, %k1
# else
	KMOV %k1, %VRCX
	test %VRCX, %VRCX
# endif
	jnz L(8x_end_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4,, 6
L(8x_last_2x_vec):
	VPCMP $4, (VEC_SIZE * 2)(%rdx), %VMM(3), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(8x_return_vec_2)
	.p2align 4,, 5
L(8x_last_1x_vec):
	VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
	VPCMP $4, (VEC_SIZE * 3)(%rdx), %VMM(1), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(8x_return_vec_3)
	ret

	/* Not ideally aligned (at offset +9 bytes in fetch block) but
	   not aligning keeps it in the same cache line as
	   L(8x_last_1x/2x_vec), so it is likely worth it.  It also
	   saves code size.  */
	.p2align 4,, 4
L(8x_return_vec_2):
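	/* The mismatch was found at offset VEC_SIZE * 2 from rdx, but
	   the return code shared with L(8x_return_vec_3) indexes at
	   VEC_SIZE * 3, so back rdx up by one VEC first.  */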
	subq $VEC_SIZE, %rdx
L(8x_return_vec_3):
	bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	leaq (%rdx, %rax, CHAR_SIZE), %rax
	movl (VEC_SIZE * 3)(%rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	addq %rdx, %rax
	movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 3)(%rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 8
L(last_2x_vec):
	/* Check second to last VEC.  */
	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
	VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_1_end)

	/* Check last VEC.  */
	.p2align 4,, 8
L(last_1x_vec):
	VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
	VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_0_end)
	ret


	/* Don't fully align.  It takes 2 fetch blocks either way, and
	   aligning would cause code to spill into another cache line.
	 */
	.p2align 4,, 3
L(return_vec_1_end):
	/* Use bsf to save code size.  This is necessary to have
	   L(one_or_less) fit in the aligning bytes between.  */
	bsf %VRAX, %VRAX
	addl %edx, %eax
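	/* eax now holds size + mismatch index (both in chars), so the
	   -(VEC_SIZE * 2) displacements below resolve to the mismatching
	   char of the second-to-last VEC, which was loaded relative to
	   the end of the buffers.  */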
# ifdef USE_AS_WMEMCMP
	movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 2
	/* Don't align.  It takes 2 fetch blocks either way, and aligning
	   would cause code to spill into another cache line.  */
L(return_vec_0_end):
	bsf %VRAX, %VRAX
	addl %edx, %eax
# ifdef USE_AS_WMEMCMP
	movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl -VEC_SIZE(%rsi, %rax), %ecx
	movzbl -VEC_SIZE(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret
	/* evex256: 2 bytes until next cache line.  evex512: 46 bytes
	   until next cache line.  */
END (MEMCMP)
#endif