/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE and
      loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * VEC_SIZE bytes one
      VEC at a time to check for early mismatches.  Only do this if
      it is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.
   A rough outline of this size dispatch is sketched in the comment
   below.  */
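
/* Rough outline of the size dispatch implemented below (illustrative
   only, not generated code):

     if (size < VEC_SIZE)        goto L(less_vec);
     compare VEC 0;
     if (size <= 2 * VEC_SIZE)   goto L(last_1x_vec);
     compare VEC 1;
     if (size <= 4 * VEC_SIZE)   goto L(last_2x_vec);
     compare VEC 2 and VEC 3;
     if (size > 8 * VEC_SIZE)    goto L(more_8x_vec);  (aligned 4x loop)
     compare the last 4 VEC with overlapping unaligned loads.  */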


# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define CHAR_SIZE 4
#  define VPCMPEQ vpcmpeqd
# else
#  define CHAR_SIZE 1
#  define VPCMPEQ vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/
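
/* For example, with a first byte of 0x80 in s1 and 0x7f in s2, memcmp
   must return a positive value because bytes compare as unsigned
   char, while wmemcmp comparing elements 0x80000000 and 0x7fffffff
   must return a negative value because wchar_t is a signed 32-bit
   integer on this target.  Hence the wmemcmp paths below use setg
   (signed greater) while the memcmp paths use a plain byte
   subtraction.  */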

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
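	/* Scale the wchar_t count in RDX to a byte count
	   (CHAR_SIZE is 4).  */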
	shl $2, %RDX_LP
# elif defined __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
	cmp $VEC_SIZE, %RDX_LP
	jb L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu (%rsi), %ymm1
	VPCMPEQ (%rdi), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
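	/* vpmovmskb sets one mask bit per byte that compared equal; if
	   the whole VEC matched, the mask is 0xffffffff and incl wraps
	   it to zero, setting the zero flag.  */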
	incl %eax
	jnz L(return_vec_0)

	cmpq $(VEC_SIZE * 2), %rdx
	jbe L(last_1x_vec)

	/* Check second VEC no matter what.  */
	vmovdqu VEC_SIZE(%rsi), %ymm2
	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	/* If s1 and s2 were equal in this VEC, eax will be all 1s, so
	   incl will overflow and set the zero flag.  */
	incl %eax
	jnz L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq $(VEC_SIZE * 4), %rdx
	jbe L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl %eax
	jnz L(return_vec_2)
	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpmovmskb %ymm4, %ecx
	incl %ecx
	jnz L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq $(VEC_SIZE * 8), %rdx
	ja L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
	vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
	leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi

	/* Wait to load from s1 until the addresses have been adjusted,
	   to avoid unlamination of micro-fusion with complex address
	   modes.  */
	VPCMPEQ (%rdi), %ymm1, %ymm1
	VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2

	vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	/* Reduce VEC0 - VEC4.  */
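	/* ymm7 is all ones only if every byte of all four compares
	   matched, so a single vpmovmskb + incl below tests
	   4 * VEC_SIZE bytes at once.  */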
	vpand %ymm1, %ymm2, %ymm5
	vpand %ymm3, %ymm4, %ymm6
	vpand %ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	incl %ecx
	jnz L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_0):
	tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
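	/* Branchless result: dl becomes 1 if the s1 element is
	   (signed) greater than the s2 element, else 0, and
	   2 * dl - 1 then yields 1 or -1 (a mismatch guarantees the
	   elements differ).  */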
	movl (%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (%rsi, %rax), %ecx
	movzbl (%rdi, %rax), %eax
	subl %ecx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(return_vec_1):
	tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl VEC_SIZE(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl VEC_SIZE(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl VEC_SIZE(%rsi, %rax), %ecx
	movzbl VEC_SIZE(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_2):
	tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	/* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
	.p2align 5
L(8x_return_vec_0_1_2_3):
	/* Returning from L(more_8x_vec) requires restoring rsi.  */
	addq %rdi, %rsi
L(return_vec_0_1_2_3):
	vpmovmskb %ymm1, %eax
	incl %eax
	jnz L(return_vec_0)

	vpmovmskb %ymm2, %eax
	incl %eax
	jnz L(return_vec_1)

	vpmovmskb %ymm3, %eax
	incl %eax
	jnz L(return_vec_2)
L(return_vec_3):
	tzcntl %ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	xorl %edx, %edx
	cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to update only one
	   pointer.  */
	subq %rdi, %rsi
	/* Align s1 pointer.  */
	andq $-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq $-(VEC_SIZE * 4), %rdi
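	/* NB: the subq of a negative immediate above adds
	   VEC_SIZE * 4; -128 fits in a sign-extended 8-bit immediate
	   while +128 does not.  */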
	.p2align 4
L(loop_4x_vec):
	/* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
	 */
	vmovdqu (%rsi, %rdi), %ymm1
	VPCMPEQ (%rdi), %ymm1, %ymm1

	vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2
	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2

	vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

	vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	vpand %ymm1, %ymm2, %ymm5
	vpand %ymm3, %ymm4, %ymm6
	vpand %ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	incl %ecx
	jnz L(8x_return_vec_0_1_2_3)
	subq $-(VEC_SIZE * 4), %rdi
	/* Check if s1 pointer at end.  */
	cmpq %rdx, %rdi
	jb L(loop_4x_vec)

	subq %rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl $(VEC_SIZE * 3), %edi
	jae L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3
	cmpl $(VEC_SIZE * 2), %edi
	jae L(8x_last_2x_vec)

	/* Check last 4 VEC.  */
	vmovdqu (%rsi, %rdx), %ymm1
	VPCMPEQ (%rdx), %ymm1, %ymm1

	vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2
	VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2

	VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3

	vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4

	vpand %ymm1, %ymm2, %ymm5
	vpand %ymm3, %ymm4, %ymm6
	vpand %ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	/* Restore s1 pointer to rdi.  */
	movq %rdx, %rdi
	incl %ecx
	jnz L(8x_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	VZEROUPPER_RETURN

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4
L(8x_last_2x_vec):
	/* Check second to last VEC.  rdx stores the end pointer of s1
	   and ymm3 has already been loaded with the second to last VEC
	   from s2.  */
	VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl %eax
	jnz L(8x_return_vec_2)
	/* Check last VEC.  */
	.p2align 4
L(8x_last_1x_vec):
	vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl %eax
	jnz L(8x_return_vec_3)
	VZEROUPPER_RETURN

	.p2align 4
L(last_2x_vec):
	/* Check second to last VEC.  */
	vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
	VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl %eax
	jnz L(return_vec_1_end)
	/* Check last VEC.  */
L(last_1x_vec):
	vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
	VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl %eax
	jnz L(return_vec_0_end)
	VZEROUPPER_RETURN

	.p2align 4
L(8x_return_vec_2):
	subq $VEC_SIZE, %rdx
L(8x_return_vec_3):
	tzcntl %eax, %eax
	addq %rdx, %rax
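	/* rsi still holds s2 - s1 from L(more_8x_vec), so
	   (%rsi, %rax) below addresses the corresponding element of
	   s2.  */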
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 3)(%rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 3)(%rax), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_1_end):
	tzcntl %eax, %eax
	addl %edx, %eax
# ifdef USE_AS_WMEMCMP
	movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_vec_0_end):
	tzcntl %eax, %eax
	addl %edx, %eax
# ifdef USE_AS_WMEMCMP
	movl -VEC_SIZE(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl -VEC_SIZE(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl -VEC_SIZE(%rsi, %rax), %ecx
	movzbl -VEC_SIZE(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(less_vec):
	/* Check if one or less CHAR.  This is necessary for size = 0
	   but is also faster for size = CHAR_SIZE.  */
	cmpl $CHAR_SIZE, %edx
	jbe L(one_or_less)

	/* Check if loading one VEC from either s1 or s2 could cause a
	   page cross.  This can have false positives but is by far the
	   fastest method.  */
	movl %edi, %eax
	orl %esi, %eax
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jg L(page_cross_less_vec)
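	/* NB: the masked OR above is at least as large as either page
	   offset, so a real page cross is never missed; a false
	   positive only costs taking the slower path.  */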

	/* No page cross possible.  */
	vmovdqu (%rsi), %ymm2
	VPCMPEQ (%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	incl %eax
	/* Result will be zero if s1 and s2 match.  Otherwise first set
	   bit will be first mismatch.  */
	bzhil %edx, %eax, %edx
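	/* bzhi clears the mask bits at positions >= size, so a
	   difference beyond the requested length cannot make the
	   result nonzero; the flags from bzhi feed the jnz below.  */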
	jnz L(return_vec_0)
	xorl %eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(page_cross_less_vec):
	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
	   bytes.  */
	cmpl $16, %edx
	jae L(between_16_31)
# ifndef USE_AS_WMEMCMP
	cmpl $8, %edx
	jae L(between_8_15)
	/* Fall through for [4, 7].  */
	cmpl $4, %edx
	jb L(between_2_3)

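	/* Sizes 4 to 7: build a 64-bit big-endian key from the first
	   and last 4 bytes of each buffer (the two loads overlap), so
	   one unsigned subtraction plus the carry flag orders the
	   buffers without branches.  */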
	movbe (%rdi), %eax
	movbe (%rsi), %ecx
	shlq $32, %rax
	shlq $32, %rcx
	movbe -4(%rdi, %rdx), %edi
	movbe -4(%rsi, %rdx), %esi
	orq %rdi, %rax
	orq %rsi, %rcx
	subq %rcx, %rax
	/* Fast path for return zero.  */
	jnz L(ret_nonzero)
	/* No ymm register was touched.  */
	ret

	.p2align 4
L(one_or_less):
	jb L(zero)
	movzbl (%rsi), %ecx
	movzbl (%rdi), %eax
	subl %ecx, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4,, 5
L(ret_nonzero):
	sbbl %eax, %eax
	orl $1, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4,, 2
L(zero):
	xorl %eax, %eax
	/* No ymm register was touched.  */
	ret

	.p2align 4
L(between_8_15):
	movbe (%rdi), %rax
	movbe (%rsi), %rcx
	subq %rcx, %rax
	jnz L(ret_nonzero)
	movbe -8(%rdi, %rdx), %rax
	movbe -8(%rsi, %rdx), %rcx
	subq %rcx, %rax
	/* Fast path for return zero.  */
	jnz L(ret_nonzero)
	/* No ymm register was touched.  */
	ret
# else
	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
	vmovq (%rdi), %xmm1
	vmovq (%rsi), %xmm2
	VPCMPEQ %xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl $0xffff, %eax
	jnz L(return_vec_0)
	/* Use overlapping loads to avoid branches.  */
	leaq -8(%rdi, %rdx), %rdi
	leaq -8(%rsi, %rdx), %rsi
	vmovq (%rdi), %xmm1
	vmovq (%rsi), %xmm2
	VPCMPEQ %xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl $0xffff, %eax
	/* Fast path for return zero.  */
	jnz L(return_vec_0)
	/* No ymm register was touched.  */
	ret
# endif

	.p2align 4,, 10
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */
	vmovdqu (%rsi), %xmm2
	VPCMPEQ (%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
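	/* With 16-byte vectors vpmovmskb sets only the low 16 mask
	   bits, so subtracting 0xffff (rather than using incl) gives
	   zero exactly when every byte matched.  */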
	subl $0xffff, %eax
	jnz L(return_vec_0)

	/* Use overlapping loads to avoid branches.  */

	vmovdqu -16(%rsi, %rdx), %xmm2
	leaq -16(%rdi, %rdx), %rdi
	leaq -16(%rsi, %rdx), %rsi
	VPCMPEQ (%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl $0xffff, %eax
	/* Fast path for return zero.  */
	jnz L(return_vec_0)
	/* No ymm register was touched.  */
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4,, 2
L(zero):
	xorl %eax, %eax
	ret

	.p2align 4
L(one_or_less):
	jb L(zero)
	movl (%rdi), %ecx
	xorl %edx, %edx
	cmpl (%rsi), %ecx
	je L(zero)
	setg %dl
	leal -1(%rdx, %rdx), %eax
	/* No ymm register was touched.  */
	ret
# else

	.p2align 4
L(between_2_3):
	/* Load as big endian to avoid branches.  */
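	/* After bswap the two leading bytes sit in the upper half as a
	   big-endian value; shifting right by one keeps bit 31 clear,
	   so the final subl cannot overflow and its sign matches the
	   unsigned ordering.  */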
	movzwl (%rdi), %eax
	movzwl (%rsi), %ecx
	bswap %eax
	bswap %ecx
	shrl %eax
	shrl %ecx
	movzbl -1(%rdi, %rdx), %edi
	movzbl -1(%rsi, %rdx), %esi
	orl %edi, %eax
	orl %esi, %ecx
	/* Subtraction is okay because the upper bit is zero.  */
	subl %ecx, %eax
	/* No ymm register was touched.  */
	ret
# endif

END (MEMCMP)
#endif