1 | /* memcmp with SSE2 |
2 | Copyright (C) 2009-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | #ifdef USE_AS_WMEMCMP |
22 | # define PCMPEQ pcmpeqd |
23 | # define CHAR_SIZE 4 |
24 | # define SIZE_OFFSET (0) |
25 | #else |
26 | # define PCMPEQ pcmpeqb |
27 | # define CHAR_SIZE 1 |
28 | #endif |
29 | |
30 | #ifdef USE_AS_MEMCMPEQ |
31 | # define SIZE_OFFSET (0) |
32 | # define CHECK_CMP(x, y) subl x, y |
33 | #else |
34 | # ifndef SIZE_OFFSET |
35 | # define SIZE_OFFSET (CHAR_PER_VEC * 2) |
36 | # endif |
37 | # define CHECK_CMP(x, y) cmpl x, y |
38 | #endif |
39 | |
40 | #define VEC_SIZE 16 |
41 | #define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) |
42 | |
43 | #ifndef MEMCMP |
44 | # define MEMCMP memcmp |
45 | #endif |
46 | |
47 | .text |
48 | ENTRY(MEMCMP) |
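 | /* Input (SysV ABI): rdi = s1, rsi = s2, rdx = length in CHAR_SIZE |
 | units (bytes for memcmp/__memcmpeq, wide chars for wmemcmp). */ |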
49 | # ifdef __ILP32__ |
50 | /* Clear the upper 32 bits. */ |
51 | movl %edx, %edx |
52 | # endif |
53 | #ifdef USE_AS_WMEMCMP |
54 | /* Use 0xffff to test for mismatches in the pmovmskb bitmask. |
55 | Store it in ecx for code size. This is preferable to using |
56 | `incw` as it avoids partial register stalls on older hardware |
57 | (pre-SnB). */ |
58 | movl $0xffff, %ecx |
59 | #endif |
60 | cmpq $CHAR_PER_VEC, %rdx |
61 | ja L(more_1x_vec) |
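 | /* n <= CHAR_PER_VEC: compare one chunk anchored at the start of |
 | the buffers and one anchored at the end (the two overlap for |
 | in-between sizes). n = [0, 1] is handled via L(cmp_0_1). */ |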
62 | |
63 | #ifdef USE_AS_WMEMCMP |
64 | /* Saves a byte of code by keeping the fall-through path for |
65 | n = [2, 4] in the initial cache line. */ |
66 | decl %edx |
67 | jle L(cmp_0_1) |
68 | |
69 | movq (%rsi), %xmm0 |
70 | movq (%rdi), %xmm1 |
71 | PCMPEQ %xmm0, %xmm1 |
72 | pmovmskb %xmm1, %eax |
73 | subl %ecx, %eax |
74 | jnz L(ret_nonzero_vec_start_0) |
75 | |
76 | movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0 |
77 | movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1 |
78 | PCMPEQ %xmm0, %xmm1 |
79 | pmovmskb %xmm1, %eax |
80 | subl %ecx, %eax |
81 | jnz L(ret_nonzero_vec_end_0_adj) |
82 | #else |
83 | cmpl $8, %edx |
84 | ja L(cmp_9_16) |
85 | |
86 | cmpl $4, %edx |
87 | jb L(cmp_0_3) |
88 | |
89 | # ifdef USE_AS_MEMCMPEQ |
90 | movl (%rsi), %eax |
91 | subl (%rdi), %eax |
92 | |
93 | movl -4(%rsi, %rdx), %esi |
94 | subl -4(%rdi, %rdx), %esi |
95 | |
96 | orl %esi, %eax |
97 | ret |
98 | # else |
99 | /* Combine comparisons for lo and hi 4-byte comparisons. */ |
100 | movl -4(%rsi, %rdx), %ecx |
101 | movl -4(%rdi, %rdx), %eax |
102 | shlq $32, %rcx |
103 | shlq $32, %rax |
104 | movl (%rsi), %esi |
105 | movl (%rdi), %edi |
106 | orq %rsi, %rcx |
107 | orq %rdi, %rax |
108 | /* Only compute proper return if not-equal. */ |
109 | cmpq %rcx, %rax |
110 | jnz L(ret_nonzero) |
111 | xorl %eax, %eax |
112 | ret |
113 | # endif |
114 | |
115 | .p2align 4,, 10 |
116 | L(cmp_9_16): |
117 | # ifdef USE_AS_MEMCMPEQ |
118 | movq (%rsi), %rax |
119 | subq (%rdi), %rax |
120 | |
121 | movq -8(%rsi, %rdx), %rcx |
122 | subq -8(%rdi, %rdx), %rcx |
123 | orq %rcx, %rax |
124 | /* Convert 64 bit -> 32 bit boolean (we should have made the ABI |
125 | return long). */ |
126 | setnz %cl |
127 | movzbl %cl, %eax |
128 | # else |
129 | movq (%rsi), %rcx |
130 | movq (%rdi), %rax |
131 | /* Only compute proper return if not-equal. */ |
132 | cmpq %rcx, %rax |
133 | jnz L(ret_nonzero) |
134 | |
135 | movq -8(%rsi, %rdx, CHAR_SIZE), %rcx |
136 | movq -8(%rdi, %rdx, CHAR_SIZE), %rax |
137 | /* Only compute proper return if not-equal. */ |
138 | cmpq %rcx, %rax |
139 | jnz L(ret_nonzero) |
140 | xorl %eax, %eax |
141 | # endif |
142 | #endif |
143 | ret |
144 | |
145 | .p2align 4,, 8 |
146 | L(cmp_0_1): |
147 | /* Flag set by earlier comparison against 1. */ |
148 | jne L(cmp_0_0) |
149 | #ifdef USE_AS_WMEMCMP |
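 | /* Branchless sign: edx is zeroed, setg makes it 1 iff the s1 |
 | element is greater, and the lea computes 2 * edx - 1, i.e. 1 or |
 | -1. Elements are compared as signed ints (wchar_t is signed |
 | here). */ |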
150 | movl (%rdi), %ecx |
151 | xorl %edx, %edx |
152 | cmpl (%rsi), %ecx |
153 | je L(cmp_0_0) |
154 | setg %dl |
155 | leal -1(%rdx, %rdx), %eax |
156 | #else |
157 | movzbl (%rdi), %eax |
158 | movzbl (%rsi), %ecx |
159 | subl %ecx, %eax |
160 | #endif |
161 | ret |
162 | |
163 | /* Fits in the alignment padding bytes. */ |
164 | L(cmp_0_0): |
165 | xorl %eax, %eax |
166 | ret |
167 | |
168 | #ifdef USE_AS_WMEMCMP |
169 | .p2align 4 |
170 | L(ret_nonzero_vec_start_0): |
171 | bsfl %eax, %eax |
172 | movl (%rdi, %rax), %ecx |
173 | xorl %edx, %edx |
174 | cmpl (%rsi, %rax), %ecx |
175 | /* NB: no partial register stall here because of the xorl zero |
176 | idiom above. */ |
177 | setg %dl |
178 | leal -1(%rdx, %rdx), %eax |
179 | ret |
180 | #else |
181 | |
182 | # ifndef USE_AS_MEMCMPEQ |
183 | .p2align 4,, 14 |
184 | L(ret_nonzero): |
185 | /* Need to bswap to get a proper return value without a branch. */ |
186 | bswapq %rcx |
187 | bswapq %rax |
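 | /* After bswap the first byte in memory is the most significant, |
 | so an unsigned 64-bit compare orders the qwords the way memcmp |
 | orders bytes. subq sets CF iff s1 < s2, sbb turns that into 0 or |
 | -1, and `or $1` maps 0 to 1 (this path is only reached when the |
 | values differ). */ |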
188 | subq %rcx, %rax |
189 | sbbl %eax, %eax |
190 | orl $1, %eax |
191 | ret |
192 | # endif |
193 | |
194 | .p2align 4 |
195 | L(cmp_0_3): |
196 | # ifdef USE_AS_MEMCMPEQ |
197 | /* No reason to add to the dependency chain on rdx. Saving the |
198 | bytes here doesn't change the number of fetch blocks. */ |
199 | cmpl $1, %edx |
200 | jbe L(cmp_0_1) |
201 | # else |
202 | /* We need the code-size saving here to prevent taking an extra |
203 | fetch block. */ |
204 | decl %edx |
205 | jle L(cmp_0_1) |
206 | # endif |
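 | /* n = [2, 3]: compare the 2-byte prefix and the final byte (the |
 | final byte overlaps the prefix when n == 2). */ |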
207 | movzwl (%rsi), %ecx |
208 | movzwl (%rdi), %eax |
209 | |
210 | # ifdef USE_AS_MEMCMPEQ |
211 | subl %ecx, %eax |
212 | |
213 | movzbl -1(%rsi, %rdx), %esi |
214 | movzbl -1(%rdi, %rdx), %edi |
215 | subl %edi, %esi |
216 | orl %esi, %eax |
217 | # else |
218 | bswapl %ecx |
219 | bswapl %eax |
220 | |
221 | /* Implicit right shift by one. We just need to displace the |
222 | sign bits. */ |
223 | shrl %ecx |
224 | shrl %eax |
225 | |
226 | /* Eat a partial register stall here. This saves code size, |
227 | stopping L(cmp_0_3) from bleeding into the next fetch block, |
228 | and saves an ALU op. */ |
229 | movb (%rsi, %rdx), %cl |
230 | movzbl (%rdi, %rdx), %edi |
231 | orl %edi, %eax |
232 | subl %ecx, %eax |
233 | # endif |
234 | ret |
235 | #endif |
236 | |
237 | .p2align 5 |
238 | L(more_1x_vec): |
239 | #ifndef USE_AS_WMEMCMP |
240 | /* Use 0xffff to test for mismatches in the pmovmskb bitmask. |
241 | Store it in ecx for code size. This is preferable to using |
242 | `incw` as it avoids partial register stalls on older hardware |
243 | (pre-SnB). */ |
244 | movl $0xffff, %ecx |
245 | #endif |
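 | /* Match test used throughout: PCMPEQ leaves all-ones in every |
 | matching element, so pmovmskb yields 0xffff when the whole vector |
 | matches and subtracting the 0xffff kept in ecx gives zero iff |
 | equal. On a mismatch the low 16 bits become mask + 1, whose |
 | lowest set bit (found with bsf in the return paths) is the first |
 | mismatching byte. E.g. mask 0xf0ff: 0xf0ff - 0xffff = 0xfffff100 |
 | and bsf gives 8. */ |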
246 | movups (%rsi), %xmm0 |
247 | movups (%rdi), %xmm1 |
248 | PCMPEQ %xmm0, %xmm1 |
249 | pmovmskb %xmm1, %eax |
250 | subl %ecx, %eax |
251 | jnz L(ret_nonzero_vec_start_0) |
252 | #if SIZE_OFFSET == 0 |
253 | cmpq $(CHAR_PER_VEC * 2), %rdx |
254 | #else |
255 | /* Offset rdx. Saves just enough code size to keep the |
256 | L(last_2x_vec) case and the non-zero return in a single |
257 | cache line. */ |
258 | subq $(CHAR_PER_VEC * 2), %rdx |
259 | #endif |
260 | ja L(more_2x_vec) |
261 | |
262 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 |
263 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 |
264 | PCMPEQ %xmm0, %xmm1 |
265 | pmovmskb %xmm1, %eax |
266 | subl %ecx, %eax |
267 | #ifndef USE_AS_MEMCMPEQ |
268 | /* Don't use `incw ax` as machines this code runs on are liable |
269 | to have partial register stalls. */ |
270 | jnz L(ret_nonzero_vec_end_0) |
271 | #else |
272 | /* Various return targets for memcmpeq. Will always be hot in |
273 | Icache and get short encoding. */ |
274 | L(ret_nonzero_vec_start_1): |
275 | L(ret_nonzero_vec_start_0): |
276 | L(ret_nonzero_vec_end_0): |
277 | #endif |
278 | ret |
279 | |
280 | #ifndef USE_AS_MEMCMPEQ |
281 | # ifdef USE_AS_WMEMCMP |
282 | .p2align 4 |
283 | L(ret_nonzero_vec_end_0_adj): |
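 | /* The small-size wmemcmp path decremented edx and loaded the last |
 | two wide chars with movq; add 3 so the shared code below, which |
 | is written for a full-vector load ending at the buffer end, |
 | addresses the same bytes. */ |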
284 | addl $3, %edx |
285 | # else |
286 | .p2align 4,, 8 |
287 | # endif |
288 | L(ret_nonzero_vec_end_0): |
289 | bsfl %eax, %eax |
290 | # ifdef USE_AS_WMEMCMP |
291 | leal (%rax, %rdx, CHAR_SIZE), %eax |
292 | movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx |
293 | xorl %edx, %edx |
294 | cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx |
295 | /* NB: no partial register stall here because of the xorl zero |
296 | idiom above. */ |
297 | setg %dl |
298 | leal -1(%rdx, %rdx), %eax |
299 | # else |
300 | /* Use `addq` instead of `addl` here so that even if `rax` + `rdx` |
301 | is negative, the value of the sum is still usable as a 64-bit |
302 | offset (negative 32-bit numbers zero-extend to large and often |
303 | out-of-bounds 64-bit offsets). Note that `rax` + `rdx` >= 0 is |
304 | an invariant when `memcmp` is used correctly, but if the input |
305 | strings `rsi`/`rdi` are concurrently modified while the function |
306 | runs (i.e. there is a data race) it is possible for `rax` + `rdx` |
307 | to be negative. Given that there is virtually no extra cost to |
308 | using `addq` instead of `addl`, we may as well protect the |
309 | data-race case. */ |
310 | addq %rdx, %rax |
311 | movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx |
312 | movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax |
313 | subl %ecx, %eax |
314 | # endif |
315 | ret |
316 | # ifndef USE_AS_WMEMCMP |
317 | .p2align 4,, 10 |
318 | L(ret_nonzero_vec_start_0): |
319 | bsfl %eax, %eax |
320 | movzbl (%rsi, %rax), %ecx |
321 | movzbl (%rdi, %rax), %eax |
322 | subl %ecx, %eax |
323 | ret |
324 | # endif |
325 | #else |
326 | #endif |
327 | |
328 | .p2align 5 |
329 | L(more_2x_vec): |
330 | movups (VEC_SIZE * 1)(%rsi), %xmm0 |
331 | movups (VEC_SIZE * 1)(%rdi), %xmm1 |
332 | PCMPEQ %xmm0, %xmm1 |
333 | pmovmskb %xmm1, %eax |
334 | subl %ecx, %eax |
335 | jnz L(ret_nonzero_vec_start_1) |
336 | |
337 | cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx |
338 | jbe L(last_2x_vec) |
339 | |
340 | cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx |
341 | ja L(more_8x_vec) |
342 | |
343 | /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time. |
344 | This can harm performance if there is a non-zero return in |
345 | [65, 80] or [97, 112], but helps performance otherwise. |
346 | Generally the zero-return case is hotter. */ |
347 | movups (VEC_SIZE * 2)(%rsi), %xmm0 |
348 | movups (VEC_SIZE * 2)(%rdi), %xmm1 |
349 | PCMPEQ %xmm0, %xmm1 |
350 | movups (VEC_SIZE * 3)(%rsi), %xmm2 |
351 | movups (VEC_SIZE * 3)(%rdi), %xmm3 |
352 | PCMPEQ %xmm2, %xmm3 |
353 | pand %xmm1, %xmm3 |
354 | |
355 | pmovmskb %xmm3, %eax |
356 | CHECK_CMP (%ecx, %eax) |
357 | jnz L(ret_nonzero_vec_start_2_3) |
358 | |
359 | cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx |
360 | jbe L(last_2x_vec) |
361 | |
362 | movups (VEC_SIZE * 4)(%rsi), %xmm0 |
363 | movups (VEC_SIZE * 4)(%rdi), %xmm1 |
364 | PCMPEQ %xmm0, %xmm1 |
365 | movups (VEC_SIZE * 5)(%rsi), %xmm2 |
366 | movups (VEC_SIZE * 5)(%rdi), %xmm3 |
367 | PCMPEQ %xmm2, %xmm3 |
368 | pand %xmm1, %xmm3 |
369 | |
370 | pmovmskb %xmm3, %eax |
371 | CHECK_CMP (%ecx, %eax) |
372 | #ifdef USE_AS_MEMCMPEQ |
373 | jz L(last_2x_vec) |
374 | ret |
375 | #else |
376 | jnz L(ret_nonzero_vec_start_4_5) |
377 | #endif |
378 | .p2align 4 |
379 | L(last_2x_vec): |
380 | movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 |
381 | movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 |
382 | PCMPEQ %xmm0, %xmm1 |
383 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2 |
384 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3 |
385 | PCMPEQ %xmm2, %xmm3 |
386 | pand %xmm1, %xmm3 |
387 | pmovmskb %xmm3, %eax |
388 | subl %ecx, %eax |
389 | #ifdef USE_AS_MEMCMPEQ |
390 | /* Various return targets for memcmpeq. Will always be hot in |
391 | Icache and get short encoding. */ |
392 | L(ret_nonzero_vec_start_2_3): |
393 | L(ret_nonzero_vec_start_4_5): |
394 | ret |
395 | #else |
396 | jnz L(ret_nonzero_vec_end_1) |
397 | ret |
398 | |
399 | .p2align 4,, 8 |
400 | L(ret_nonzero_vec_end_1): |
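 | /* eax holds the AND'ed pair's mask minus 0xffff: its low 16 bits |
 | (the mask plus one) have their lowest set bit at the pair's first |
 | mismatch. Rotating them into the high half and xor'ing in xmm1's |
 | mask puts the second-to-last vector's mismatch bits in bits 0-15, |
 | so a single bsf picks whichever mismatch comes first in memory. */ |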
401 | pmovmskb %xmm1, %ecx |
402 | /* High 16 bits of eax are guaranteed to be all ones. Rotate them |
403 | in so we can do `or + not` with just `xor`. */ |
404 | rorl $16, %eax |
405 | xorl %ecx, %eax |
406 | /* Partial register stall. */ |
407 | |
408 | bsfl %eax, %eax |
409 | # ifdef USE_AS_WMEMCMP |
410 | leal (%rax, %rdx, CHAR_SIZE), %eax |
411 | movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx |
412 | xorl %edx, %edx |
413 | cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx |
414 | /* NB: no partial register stall here because of the xorl zero |
415 | idiom above. */ |
416 | setg %dl |
417 | leal -1(%rdx, %rdx), %eax |
418 | # else |
419 | addl %edx, %eax |
420 | movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx |
421 | movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax |
422 | subl %ecx, %eax |
423 | # endif |
424 | ret |
425 | |
426 | .p2align 4 |
427 | L(ret_nonzero_vec_start_4_5): |
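 | /* eax is the AND'ed mask of the vectors at (VEC_SIZE * 4) and |
 | (VEC_SIZE * 5), edx the mask of the first of the two. After the |
 | shift, the +1 in the lea carries through edx's trailing ones, so |
 | the lowest set bit of the sum is the first mismatching byte |
 | counted from (VEC_SIZE * 4)(%rdi). */ |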
428 | pmovmskb %xmm1, %edx |
429 | sall $16, %eax |
430 | leal 1(%rax, %rdx), %eax |
431 | bsfl %eax, %eax |
432 | # ifdef USE_AS_WMEMCMP |
433 | movl (VEC_SIZE * 4)(%rdi, %rax), %ecx |
434 | xorl %edx, %edx |
435 | cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx |
436 | /* NB: no partial register stall here because of the xorl zero |
437 | idiom above. */ |
438 | setg %dl |
439 | leal -1(%rdx, %rdx), %eax |
440 | # else |
441 | movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx |
442 | movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax |
443 | subl %ecx, %eax |
444 | # endif |
445 | ret |
446 | |
447 | .p2align 4,, 8 |
448 | L(ret_nonzero_vec_start_1): |
449 | bsfl %eax, %eax |
450 | # ifdef USE_AS_WMEMCMP |
451 | movl (VEC_SIZE * 1)(%rdi, %rax), %ecx |
452 | xorl %edx, %edx |
453 | cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx |
454 | /* NB: no partial register stall here because of the xorl zero |
455 | idiom above. */ |
456 | setg %dl |
457 | leal -1(%rdx, %rdx), %eax |
458 | # else |
459 | movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx |
460 | movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax |
461 | subl %ecx, %eax |
462 | # endif |
463 | ret |
464 | #endif |
465 | |
466 | .p2align 4 |
467 | L(more_8x_vec): |
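 | /* Rewrite rsi as an offset from rdi, align rdi down to VEC_SIZE |
 | so the loop's PCMPEQ memory operands are aligned, then rebuild |
 | rsi at the same relative distance. rdx is converted from a length |
 | into the pointer that bounds the 4x loop below. */ |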
468 | subq %rdi, %rsi |
469 | leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx |
470 | andq $(VEC_SIZE * -1), %rdi |
471 | addq %rdi, %rsi |
472 | .p2align 4 |
473 | L(loop_4x): |
474 | movups (VEC_SIZE * 2)(%rsi), %xmm0 |
475 | movups (VEC_SIZE * 3)(%rsi), %xmm1 |
476 | |
477 | PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0 |
478 | PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1 |
479 | |
480 | movups (VEC_SIZE * 4)(%rsi), %xmm2 |
481 | movups (VEC_SIZE * 5)(%rsi), %xmm3 |
482 | |
483 | PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2 |
484 | PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3 |
485 | |
486 | pand %xmm0, %xmm1 |
487 | pand %xmm2, %xmm3 |
488 | pand %xmm1, %xmm3 |
489 | |
490 | pmovmskb %xmm3, %eax |
491 | subl %ecx, %eax |
492 | jnz L(ret_nonzero_loop) |
493 | |
494 | addq $(VEC_SIZE * 4), %rdi |
495 | addq $(VEC_SIZE * 4), %rsi |
496 | cmpq %rdi, %rdx |
497 | ja L(loop_4x) |
498 | /* Get remaining length in edx. */ |
499 | subl %edi, %edx |
500 | /* Restore offset so we can reuse L(last_2x_vec). */ |
501 | addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx |
502 | #ifdef USE_AS_WMEMCMP |
503 | shrl $2, %edx |
504 | #endif |
505 | cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx |
506 | jbe L(last_2x_vec) |
507 | |
508 | |
509 | movups (VEC_SIZE * 2)(%rsi), %xmm0 |
510 | movups (VEC_SIZE * 2)(%rdi), %xmm1 |
511 | PCMPEQ %xmm0, %xmm1 |
512 | movups (VEC_SIZE * 3)(%rsi), %xmm2 |
513 | movups (VEC_SIZE * 3)(%rdi), %xmm3 |
514 | PCMPEQ %xmm2, %xmm3 |
515 | pand %xmm1, %xmm3 |
516 | |
517 | pmovmskb %xmm3, %eax |
518 | CHECK_CMP (%ecx, %eax) |
519 | jz L(last_2x_vec) |
520 | #ifdef USE_AS_MEMCMPEQ |
521 | L(ret_nonzero_loop): |
522 | ret |
523 | #else |
524 | |
525 | .p2align 4 |
526 | L(ret_nonzero_vec_start_2_3): |
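 | /* Same mask-folding trick as L(ret_nonzero_vec_start_4_5): edx is |
 | the mask of the vector at (VEC_SIZE * 2), eax the AND'ed pair's |
 | mask, and the +1 carry makes bsf land on the first mismatching |
 | byte from (VEC_SIZE * 2)(%rdi). */ |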
527 | pmovmskb %xmm1, %edx |
528 | sall $16, %eax |
529 | leal 1(%rax, %rdx), %eax |
530 | |
531 | bsfl %eax, %eax |
532 | # ifdef USE_AS_WMEMCMP |
533 | movl (VEC_SIZE * 2)(%rdi, %rax), %ecx |
534 | xorl %edx, %edx |
535 | cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
536 | /* NB: no partial register stall here because of the xorl zero |
537 | idiom above. */ |
538 | setg %dl |
539 | leal -1(%rdx, %rdx), %eax |
540 | # else |
541 | movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
542 | movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax |
543 | subl %ecx, %eax |
544 | # endif |
545 | ret |
546 | |
547 | .p2align 4 |
548 | L(ret_nonzero_loop): |
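 | /* Locate the first mismatching byte across the four vectors. |
 | ecx/edx get the equality masks of the vectors at (VEC_SIZE * 2) |
 | and (VEC_SIZE * 3); the shift plus the +1 in the lea turn them |
 | into a word whose lowest set bit is their first mismatch. The |
 | rotate/xor on eax leaves the inverted mask of the vector at |
 | (VEC_SIZE * 4) in its low half and the all-four AND'ed mask plus |
 | one in its high half. Merged into one 64-bit value, bsfq yields |
 | a byte offset from (VEC_SIZE * 2)(%rdi). */ |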
549 | pmovmskb %xmm0, %ecx |
550 | pmovmskb %xmm1, %edx |
551 | sall $(VEC_SIZE * 1), %edx |
552 | leal 1(%rcx, %rdx), %edx |
553 | pmovmskb %xmm2, %ecx |
554 | /* High 16 bits of eax are guaranteed to be all ones. Rotate them |
555 | in so we can do `or + not` with just `xor`. */ |
556 | rorl $16, %eax |
557 | xorl %ecx, %eax |
558 | |
559 | salq $32, %rax |
560 | orq %rdx, %rax |
561 | |
562 | bsfq %rax, %rax |
563 | # ifdef USE_AS_WMEMCMP |
564 | movl (VEC_SIZE * 2)(%rdi, %rax), %ecx |
565 | xorl %edx, %edx |
566 | cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
567 | /* NB: no partial register stall here because of the xorl zero |
568 | idiom above. */ |
569 | setg %dl |
570 | leal -1(%rdx, %rdx), %eax |
571 | # else |
572 | movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
573 | movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax |
574 | subl %ecx, %eax |
575 | # endif |
576 | ret |
577 | #endif |
578 | END(MEMCMP) |
579 | |
580 | #ifndef USE_AS_WMEMCMP |
581 | # ifdef USE_AS_MEMCMPEQ |
582 | libc_hidden_def (MEMCMP) |
583 | # else |
584 | # undef bcmp |
585 | weak_alias (MEMCMP, bcmp) |
586 | libc_hidden_builtin_def (MEMCMP) |
587 | # endif |
588 | #endif |
589 | |