1 | /* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2. |
2 | Copyright (C) 2018-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | |
23 | # if defined USE_AS_STRCASECMP_L |
24 | # include "locale-defines.h" |
25 | # endif |
26 | |
27 | # ifndef STRCMP |
28 | # define STRCMP __strcmp_avx2 |
29 | # endif |
30 | |
31 | # define PAGE_SIZE 4096 |
32 | |
33 | /* VEC_SIZE = Number of bytes in a ymm register. */ |
34 | # define VEC_SIZE 32 |
35 | |
36 | # define VMOVU vmovdqu |
37 | # define VMOVA vmovdqa |
38 | |
39 | # ifdef USE_AS_WCSCMP |
40 | /* Compare packed dwords. */ |
41 | # define VPCMPEQ vpcmpeqd |
42 | /* Compare packed dwords and store minimum. */ |
43 | # define VPMINU vpminud |
44 | /* 1 dword char == 4 bytes. */ |
45 | # define SIZE_OF_CHAR 4 |
46 | # else |
47 | /* Compare packed bytes. */ |
48 | # define VPCMPEQ vpcmpeqb |
49 | /* Compare packed bytes and store minimum. */ |
50 | # define VPMINU vpminub |
51 | /* 1 byte char == 1 byte. */ |
52 | # define SIZE_OF_CHAR 1 |
53 | # endif |
54 | |
55 | # ifdef USE_AS_STRNCMP |
56 | # define LOOP_REG r9d |
57 | # define LOOP_REG64 r9 |
58 | |
59 | # define OFFSET_REG8 r9b |
60 | # define OFFSET_REG r9d |
61 | # define OFFSET_REG64 r9 |
62 | # else |
63 | # define LOOP_REG edx |
64 | # define LOOP_REG64 rdx |
65 | |
66 | # define OFFSET_REG8 dl |
67 | # define OFFSET_REG edx |
68 | # define OFFSET_REG64 rdx |
69 | # endif |
70 | |
71 | # ifndef VZEROUPPER |
72 | # define VZEROUPPER vzeroupper |
73 | # endif |
74 | |
75 | # if defined USE_AS_STRNCMP |
76 | # define VEC_OFFSET 0 |
77 | # else |
78 | # define VEC_OFFSET (-VEC_SIZE) |
79 | # endif |
80 | |
81 | # ifdef USE_AS_STRCASECMP_L |
82 | # define BYTE_LOOP_REG OFFSET_REG |
83 | # else |
84 | # define BYTE_LOOP_REG ecx |
85 | # endif |
86 | |
87 | # ifdef USE_AS_STRCASECMP_L |
88 | # ifdef USE_AS_STRNCMP |
89 | # define STRCASECMP __strncasecmp_avx2 |
90 | # define LOCALE_REG rcx |
91 | # define LOCALE_REG_LP RCX_LP |
92 | # define STRCASECMP_NONASCII __strncasecmp_l_nonascii |
93 | # else |
94 | # define STRCASECMP __strcasecmp_avx2 |
95 | # define LOCALE_REG rdx |
96 | # define LOCALE_REG_LP RDX_LP |
97 | # define STRCASECMP_NONASCII __strcasecmp_l_nonascii |
98 | # endif |
99 | # endif |
100 | |
101 | # define xmmZERO xmm15 |
102 | # define ymmZERO ymm15 |
103 | |
104 | # define LCASE_MIN_ymm %ymm10 |
105 | # define LCASE_MAX_ymm %ymm11 |
106 | # define CASE_ADD_ymm %ymm12 |
107 | |
108 | # define LCASE_MIN_xmm %xmm10 |
109 | # define LCASE_MAX_xmm %xmm11 |
110 | # define CASE_ADD_xmm %xmm12 |
111 | |
/* r11 is never used elsewhere, so it is safe to keep this value in it. */
113 | # define TOLOWER_BASE %r11 |
114 | |
115 | # ifndef SECTION |
116 | # define SECTION(p) p##.avx |
117 | # endif |
118 | |
119 | # ifdef USE_AS_STRCASECMP_L |
120 | # define REG(x, y) x ## y |
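
/* TOLOWER below converts 'A'-'Z' to lowercase in every byte lane and
leaves all other bytes unchanged. A rough C sketch of the per-byte
idea (illustrative only; lower_byte is a hypothetical helper, not part
of this file):

	static unsigned char
	lower_byte (unsigned char c)
	{
	  // 'A'..'Z' map to 0x80..0x99, which is exactly the range that
	  // is NOT greater than 0x99 in a signed byte compare.
	  signed char t = (signed char) (c + 0x3f);
	  return c + (t > (signed char) 0x99 ? 0 : 0x20);
	}

LCASE_MIN (0x3f), LCASE_MAX (0x99) and CASE_ADD (0x20) are the
broadcast byte constants loaded from L(lcase_min), L(lcase_max) and
L(case_add) below. */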
121 | # define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \ |
122 | vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \ |
123 | vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \ |
124 | vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \ |
125 | vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \ |
126 | vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \ |
127 | vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \ |
128 | vpaddb REG(%ext, 8), reg1_in, reg1_out; \ |
129 | vpaddb REG(%ext, 9), reg2_in, reg2_out |
130 | |
131 | # define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst |
132 | # define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm) |
133 | # define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm) |
134 | |
135 | # define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \ |
136 | TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \ |
137 | VPCMPEQ scratch_reg, s2_reg, reg_out |
138 | |
139 | # define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \ |
140 | VMOVU s2_mem, reg_out; \ |
141 | CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext) |
142 | |
143 | # define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm) |
144 | # define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm) |
145 | |
146 | # define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm) |
147 | # define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm) |
148 | |
149 | # else |
150 | # define TOLOWER_gpr(...) |
151 | # define TOLOWER_ymm(...) |
152 | # define TOLOWER_xmm(...) |
153 | |
154 | # define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \ |
155 | VPCMPEQ s2_reg, s1_reg, reg_out |
156 | |
157 | # define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__) |
158 | |
159 | # define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__) |
160 | # define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__) |
161 | # endif |
162 | |
163 | /* Warning! |
164 | wcscmp/wcsncmp have to use SIGNED comparison for elements. |
165 | strcmp/strncmp have to use UNSIGNED comparison for elements. |
166 | */ |
167 | |
/* The main idea of the string comparison (byte or dword) using AVX2
consists of comparing (VPCMPEQ) two ymm vectors. The comparison is done
on either packed bytes or packed dwords depending on USE_AS_WCSCMP. In
order to check for the null char, the algorithm keeps the matched
bytes/dwords, requiring two more AVX2 instructions (VPMINU and
VPCMPEQ). In general, the cost of comparing VEC_SIZE bytes (32 bytes)
is two VPCMPEQ and one VPMINU instruction, together with movdqu and
testl instructions. The main loop (away from a page boundary) compares
4 vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
bytes) on each iteration.

The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
same as strcmp, except that a maximum offset is tracked. If the
maximum offset is reached before a difference is found, zero is
returned. */
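
/* A rough C sketch of a single VEC_SIZE check for the strcmp (byte)
case, written with AVX2 intrinsics (illustrative only; check_vec is a
hypothetical helper, not part of this file). A zero bit in the
returned mask marks a mismatch or a null char:

	#include <immintrin.h>
	#include <stdint.h>

	static uint32_t
	check_vec (const char *s1, const char *s2)
	{
	  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
	  __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
	  __m256i eq = _mm256_cmpeq_epi8 (v1, v2);	// 1s where equal
	  __m256i nl = _mm256_cmpeq_epi8 (v1, _mm256_setzero_si256 ());
	  __m256i ok = _mm256_andnot_si256 (nl, eq);	// equal AND not null
	  return (uint32_t) _mm256_movemask_epi8 (ok);
	}

The code below adds 1 to this mask: the result is zero iff every byte
matched and none was null; otherwise tzcnt of (mask + 1) is the byte
index of the first mismatch or null. */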
182 | |
183 | .section SECTION(.text), "ax" , @progbits |
184 | .align 16 |
185 | .type STRCMP, @function |
186 | .globl STRCMP |
187 | .hidden STRCMP |
188 | |
189 | # ifndef GLABEL |
190 | # define GLABEL(...) __VA_ARGS__ |
191 | # endif |
192 | |
193 | # ifdef USE_AS_STRCASECMP_L |
194 | ENTRY (GLABEL(STRCASECMP)) |
195 | movq __libc_tsd_LOCALE@gottpoff(%rip), %rax |
196 | mov %fs:(%rax), %LOCALE_REG_LP |
197 | |
/* Either 1 or 5 bytes (depending on whether CET is enabled). */
199 | .p2align 4 |
200 | END (GLABEL(STRCASECMP)) |
201 | /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ |
202 | # endif |
203 | |
204 | .p2align 4 |
205 | STRCMP: |
206 | cfi_startproc |
207 | _CET_ENDBR |
208 | CALL_MCOUNT |
209 | |
210 | # if defined USE_AS_STRCASECMP_L |
211 | /* We have to fall back on the C implementation for locales with |
212 | encodings not matching ASCII for single bytes. */ |
213 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 |
214 | mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP |
215 | # else |
216 | mov (%LOCALE_REG), %RAX_LP |
217 | # endif |
218 | testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) |
219 | jne STRCASECMP_NONASCII |
220 | leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE |
221 | # endif |
222 | |
223 | # ifdef USE_AS_STRNCMP |
/* Don't overwrite LOCALE_REG (rcx) until we have passed
L(one_or_less). Otherwise we might use the wrong locale in
the OVERFLOW_STRCMP (strcasecmp_l). */
227 | # ifdef __ILP32__ |
228 | /* Clear the upper 32 bits. */ |
229 | movl %edx, %edx |
230 | # endif |
231 | cmp $1, %RDX_LP |
232 | /* Signed comparison intentional. We use this branch to also |
233 | test cases where length >= 2^63. These very large sizes can be |
234 | handled with strcmp as there is no way for that length to |
235 | actually bound the buffer. */ |
236 | jle L(one_or_less) |
237 | # ifdef USE_AS_WCSCMP |
238 | movq %rdx, %rcx |
239 | |
/* Multiplying length by sizeof(wchar_t) can result in overflow.
Check if that is possible. All cases where overflow is possible
are cases where the length is large enough that it can never be a
bound on valid memory, so just use wcscmp. */
244 | shrq $56, %rcx |
245 | jnz OVERFLOW_STRCMP |
246 | |
247 | leaq (, %rdx, 4), %rdx |
248 | # endif |
249 | # endif |
250 | vpxor %xmmZERO, %xmmZERO, %xmmZERO |
251 | # if defined USE_AS_STRCASECMP_L |
252 | .section .rodata.cst32, "aM" , @progbits, 32 |
253 | .align 32 |
254 | L(lcase_min): |
255 | .quad 0x3f3f3f3f3f3f3f3f |
256 | .quad 0x3f3f3f3f3f3f3f3f |
257 | .quad 0x3f3f3f3f3f3f3f3f |
258 | .quad 0x3f3f3f3f3f3f3f3f |
259 | L(lcase_max): |
260 | .quad 0x9999999999999999 |
261 | .quad 0x9999999999999999 |
262 | .quad 0x9999999999999999 |
263 | .quad 0x9999999999999999 |
264 | L(case_add): |
265 | .quad 0x2020202020202020 |
266 | .quad 0x2020202020202020 |
267 | .quad 0x2020202020202020 |
268 | .quad 0x2020202020202020 |
269 | .previous |
270 | |
271 | vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm |
272 | vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm |
273 | vmovdqa L(case_add)(%rip), CASE_ADD_ymm |
274 | # endif |
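
/* The page cross check below is roughly equivalent to this C sketch
(illustrative only; may_cross_page is a hypothetical helper, not part
of this file):

	#include <stdint.h>

	static int
	may_cross_page (uintptr_t s1, uintptr_t s2)
	{
	  uintptr_t off = (s1 | s2) & (PAGE_SIZE - 1);
	  return off > PAGE_SIZE - VEC_SIZE * 4;
	}

Shifting left by 20 keeps only the low 12 bits (the page offset, since
PAGE_SIZE is 4096) in the top of eax for the unsigned compare. Or-ing
the two offsets can only set extra bits, so the test may have false
positives but never false negatives. */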
275 | movl %edi, %eax |
276 | orl %esi, %eax |
277 | sall $20, %eax |
278 | /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ |
279 | cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax |
280 | ja L(page_cross) |
281 | |
282 | L(no_page_cross): |
283 | /* Safe to compare 4x vectors. */ |
284 | VMOVU (%rdi), %ymm0 |
/* 1s where s1 and s2 are equal. Just VPCMPEQ if it is not strcasecmp.
Otherwise convert ymm0 and the load from rsi to lowercase first.
ymm2 is scratch and ymm1 is the result. */
288 | CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) |
289 | /* 1s at null CHAR. */ |
290 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
291 | /* 1s where s1 and s2 equal AND not null CHAR. */ |
292 | vpandn %ymm1, %ymm2, %ymm1 |
293 | |
294 | /* All 1s -> keep going, any 0s -> return. */ |
295 | vpmovmskb %ymm1, %ecx |
296 | # ifdef USE_AS_STRNCMP |
297 | cmpq $VEC_SIZE, %rdx |
298 | jbe L(vec_0_test_len) |
299 | # endif |
300 | |
/* All 1s means everything compared equal; incl then overflows to zero.
Otherwise the carry from incl propagates only up to the first problem
lane, so tzcnt of the result gives the position of the first mismatch
or null. */
304 | incl %ecx |
305 | jz L(more_3x_vec) |
306 | |
307 | .p2align 4,, 4 |
308 | L(return_vec_0): |
309 | tzcntl %ecx, %ecx |
310 | # ifdef USE_AS_WCSCMP |
311 | movl (%rdi, %rcx), %edx |
312 | xorl %eax, %eax |
313 | cmpl (%rsi, %rcx), %edx |
314 | je L(ret0) |
315 | setl %al |
316 | negl %eax |
317 | orl $1, %eax |
318 | # else |
319 | movzbl (%rdi, %rcx), %eax |
320 | movzbl (%rsi, %rcx), %ecx |
321 | TOLOWER_gpr (%rax, %eax) |
322 | TOLOWER_gpr (%rcx, %ecx) |
323 | subl %ecx, %eax |
324 | # endif |
325 | L(ret0): |
326 | L(return_vzeroupper): |
327 | ZERO_UPPER_VEC_REGISTERS_RETURN |
328 | |
329 | # ifdef USE_AS_STRNCMP |
330 | .p2align 4,, 8 |
331 | L(vec_0_test_len): |
332 | notl %ecx |
333 | bzhil %edx, %ecx, %eax |
334 | jnz L(return_vec_0) |
/* Align if it will cross a fetch block. */
336 | .p2align 4,, 2 |
337 | L(ret_zero): |
338 | xorl %eax, %eax |
339 | VZEROUPPER_RETURN |
340 | |
341 | .p2align 4,, 5 |
342 | L(one_or_less): |
343 | # ifdef USE_AS_STRCASECMP_L |
344 | /* Set locale argument for strcasecmp. */ |
345 | movq %LOCALE_REG, %rdx |
346 | # endif |
347 | jb L(ret_zero) |
348 | /* 'nbe' covers the case where length is negative (large |
349 | unsigned). */ |
350 | jnbe OVERFLOW_STRCMP |
351 | # ifdef USE_AS_WCSCMP |
352 | movl (%rdi), %edx |
353 | xorl %eax, %eax |
354 | cmpl (%rsi), %edx |
355 | je L(ret1) |
356 | setl %al |
357 | negl %eax |
358 | orl $1, %eax |
359 | # else |
360 | movzbl (%rdi), %eax |
361 | movzbl (%rsi), %ecx |
362 | TOLOWER_gpr (%rax, %eax) |
363 | TOLOWER_gpr (%rcx, %ecx) |
364 | subl %ecx, %eax |
365 | # endif |
366 | L(ret1): |
367 | ret |
368 | # endif |
369 | |
370 | .p2align 4,, 10 |
371 | L(return_vec_1): |
372 | tzcntl %ecx, %ecx |
373 | # ifdef USE_AS_STRNCMP |
/* rdx must be > VEC_SIZE here, so it is safe to subtract without
fear of wrap-around. */
376 | addq $-VEC_SIZE, %rdx |
377 | cmpq %rcx, %rdx |
378 | jbe L(ret_zero) |
379 | # endif |
380 | # ifdef USE_AS_WCSCMP |
381 | movl VEC_SIZE(%rdi, %rcx), %edx |
382 | xorl %eax, %eax |
383 | cmpl VEC_SIZE(%rsi, %rcx), %edx |
384 | je L(ret2) |
385 | setl %al |
386 | negl %eax |
387 | orl $1, %eax |
388 | # else |
389 | movzbl VEC_SIZE(%rdi, %rcx), %eax |
390 | movzbl VEC_SIZE(%rsi, %rcx), %ecx |
391 | TOLOWER_gpr (%rax, %eax) |
392 | TOLOWER_gpr (%rcx, %ecx) |
393 | subl %ecx, %eax |
394 | # endif |
395 | L(ret2): |
396 | VZEROUPPER_RETURN |
397 | |
398 | .p2align 4,, 10 |
399 | # ifdef USE_AS_STRNCMP |
400 | L(return_vec_3): |
401 | salq $32, %rcx |
402 | # endif |
403 | |
404 | L(return_vec_2): |
405 | # ifndef USE_AS_STRNCMP |
406 | tzcntl %ecx, %ecx |
407 | # else |
408 | tzcntq %rcx, %rcx |
409 | cmpq %rcx, %rdx |
410 | jbe L(ret_zero) |
411 | # endif |
412 | |
413 | # ifdef USE_AS_WCSCMP |
414 | movl (VEC_SIZE * 2)(%rdi, %rcx), %edx |
415 | xorl %eax, %eax |
416 | cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx |
417 | je L(ret3) |
418 | setl %al |
419 | negl %eax |
420 | orl $1, %eax |
421 | # else |
422 | movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax |
423 | movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx |
424 | TOLOWER_gpr (%rax, %eax) |
425 | TOLOWER_gpr (%rcx, %ecx) |
426 | subl %ecx, %eax |
427 | # endif |
428 | L(ret3): |
429 | VZEROUPPER_RETURN |
430 | |
431 | # ifndef USE_AS_STRNCMP |
432 | .p2align 4,, 10 |
433 | L(return_vec_3): |
434 | tzcntl %ecx, %ecx |
435 | # ifdef USE_AS_WCSCMP |
436 | movl (VEC_SIZE * 3)(%rdi, %rcx), %edx |
437 | xorl %eax, %eax |
438 | cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx |
439 | je L(ret4) |
440 | setl %al |
441 | negl %eax |
442 | orl $1, %eax |
443 | # else |
444 | movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax |
445 | movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx |
446 | TOLOWER_gpr (%rax, %eax) |
447 | TOLOWER_gpr (%rcx, %ecx) |
448 | subl %ecx, %eax |
449 | # endif |
450 | L(ret4): |
451 | VZEROUPPER_RETURN |
452 | # endif |
453 | |
454 | .p2align 4,, 10 |
455 | L(more_3x_vec): |
456 | /* Safe to compare 4x vectors. */ |
457 | VMOVU VEC_SIZE(%rdi), %ymm0 |
458 | CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) |
459 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
460 | vpandn %ymm1, %ymm2, %ymm1 |
461 | vpmovmskb %ymm1, %ecx |
462 | incl %ecx |
463 | jnz L(return_vec_1) |
464 | |
465 | # ifdef USE_AS_STRNCMP |
466 | subq $(VEC_SIZE * 2), %rdx |
467 | jbe L(ret_zero) |
468 | # endif |
469 | |
470 | VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 |
471 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1) |
472 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
473 | vpandn %ymm1, %ymm2, %ymm1 |
474 | vpmovmskb %ymm1, %ecx |
475 | incl %ecx |
476 | jnz L(return_vec_2) |
477 | |
478 | VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 |
479 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1) |
480 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
481 | vpandn %ymm1, %ymm2, %ymm1 |
482 | vpmovmskb %ymm1, %ecx |
483 | incl %ecx |
484 | jnz L(return_vec_3) |
485 | |
486 | # ifdef USE_AS_STRNCMP |
487 | cmpq $(VEC_SIZE * 2), %rdx |
488 | jbe L(ret_zero) |
489 | # endif |
490 | |
491 | # ifdef USE_AS_WCSCMP |
/* Any non-zero positive value that doesn't interfere with 0x1. */
494 | movl $2, %r8d |
495 | |
496 | # else |
497 | xorl %r8d, %r8d |
498 | # endif |
499 | |
500 | /* The prepare labels are various entry points from the page |
501 | cross logic. */ |
502 | L(prepare_loop): |
503 | |
504 | # ifdef USE_AS_STRNCMP |
/* Store N + (VEC_SIZE * 4) and place the check at the beginning of
the loop. */
507 | leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx |
508 | # endif |
509 | L(prepare_loop_no_len): |
510 | |
511 | /* Align s1 and adjust s2 accordingly. */ |
512 | subq %rdi, %rsi |
513 | andq $-(VEC_SIZE * 4), %rdi |
514 | addq %rdi, %rsi |
515 | |
516 | # ifdef USE_AS_STRNCMP |
517 | subq %rdi, %rdx |
518 | # endif |
519 | |
520 | L(prepare_loop_aligned): |
521 | /* eax stores distance from rsi to next page cross. These cases |
522 | need to be handled specially as the 4x loop could potentially |
523 | read memory past the length of s1 or s2 and across a page |
524 | boundary. */ |
525 | movl $-(VEC_SIZE * 4), %eax |
526 | subl %esi, %eax |
527 | andl $(PAGE_SIZE - 1), %eax |
528 | |
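/* Rough C sketch of the 4x VEC reduction performed by the loop body
below (illustrative only, strcmp byte case; check_4x_vec is a
hypothetical helper, not part of this file). It returns non-zero iff
any of the 4 x 32 byte positions has a mismatch or a null char:

	#include <immintrin.h>

	static int
	check_4x_vec (const char *s1, const char *s2)
	{
	  __m256i m = _mm256_set1_epi8 (-1);
	  for (int i = 0; i < 4; i++)
	    {
	      __m256i v1 = _mm256_load_si256 ((const __m256i *) (s1 + 32 * i));
	      __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (s2 + 32 * i));
	      __m256i eq = _mm256_cmpeq_epi8 (v1, v2);
	      // Lane is 0 on mismatch, else the (possibly null) s1 byte.
	      __m256i keep = _mm256_and_si256 (v1, eq);
	      m = _mm256_min_epu8 (m, keep);
	    }
	  return _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ()));
	}

The asm keeps the per-VEC masks around so that, once the loop exits,
it can tell which of the 4 VECs contained the first mismatch/null. */
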
529 | /* Loop 4x comparisons at a time. */ |
530 | .p2align 4 |
531 | L(loop): |
532 | |
533 | /* End condition for strncmp. */ |
534 | # ifdef USE_AS_STRNCMP |
535 | subq $(VEC_SIZE * 4), %rdx |
536 | jbe L(ret_zero) |
537 | # endif |
538 | |
539 | subq $-(VEC_SIZE * 4), %rdi |
540 | subq $-(VEC_SIZE * 4), %rsi |
541 | |
542 | /* Check if rsi loads will cross a page boundary. */ |
543 | addl $-(VEC_SIZE * 4), %eax |
544 | jnb L(page_cross_during_loop) |
545 | |
546 | /* Loop entry after handling page cross during loop. */ |
547 | L(loop_skip_page_cross_check): |
548 | VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 |
549 | VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 |
550 | VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 |
551 | VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 |
552 | |
553 | /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ |
554 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1) |
555 | CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3) |
556 | CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) |
557 | CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) |
558 | |
/* Each lane becomes 0 if there was a mismatch or a null CHAR, and
stays non-zero otherwise. */
561 | vpand %ymm0, %ymm1, %ymm1 |
562 | |
563 | |
564 | vpand %ymm2, %ymm3, %ymm3 |
565 | vpand %ymm4, %ymm5, %ymm5 |
566 | vpand %ymm6, %ymm7, %ymm7 |
567 | |
568 | VPMINU %ymm1, %ymm3, %ymm3 |
569 | VPMINU %ymm5, %ymm7, %ymm7 |
570 | |
571 | /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ |
572 | VPMINU %ymm3, %ymm7, %ymm7 |
573 | |
574 | /* If any 0 CHAR then done. */ |
575 | VPCMPEQ %ymm7, %ymmZERO, %ymm7 |
576 | vpmovmskb %ymm7, %LOOP_REG |
577 | testl %LOOP_REG, %LOOP_REG |
578 | jz L(loop) |
579 | |
/* Find which VEC has the mismatch or end of string. */
581 | VPCMPEQ %ymm1, %ymmZERO, %ymm1 |
582 | vpmovmskb %ymm1, %ecx |
583 | testl %ecx, %ecx |
584 | jnz L(return_vec_0_end) |
585 | |
586 | |
587 | VPCMPEQ %ymm3, %ymmZERO, %ymm3 |
588 | vpmovmskb %ymm3, %ecx |
589 | testl %ecx, %ecx |
590 | jnz L(return_vec_1_end) |
591 | |
592 | L(return_vec_2_3_end): |
593 | # ifdef USE_AS_STRNCMP |
594 | subq $(VEC_SIZE * 2), %rdx |
595 | jbe L(ret_zero_end) |
596 | # endif |
597 | |
598 | VPCMPEQ %ymm5, %ymmZERO, %ymm5 |
599 | vpmovmskb %ymm5, %ecx |
600 | testl %ecx, %ecx |
601 | jnz L(return_vec_2_end) |
602 | |
/* LOOP_REG contains matches for null/mismatch from the loop. If
VEC 0, 1, and 2 all have no null and no mismatches then the mismatch
must come entirely from VEC 3, which is fully represented by
LOOP_REG. */
607 | tzcntl %LOOP_REG, %LOOP_REG |
608 | |
609 | # ifdef USE_AS_STRNCMP |
610 | subl $-(VEC_SIZE), %LOOP_REG |
611 | cmpq %LOOP_REG64, %rdx |
612 | jbe L(ret_zero_end) |
613 | # endif |
614 | |
615 | # ifdef USE_AS_WCSCMP |
616 | movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx |
617 | xorl %eax, %eax |
618 | cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx |
619 | je L(ret5) |
620 | setl %al |
621 | negl %eax |
622 | xorl %r8d, %eax |
623 | # else |
624 | movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax |
625 | movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx |
626 | TOLOWER_gpr (%rax, %eax) |
627 | TOLOWER_gpr (%rcx, %ecx) |
628 | subl %ecx, %eax |
629 | xorl %r8d, %eax |
630 | subl %r8d, %eax |
631 | # endif |
632 | L(ret5): |
633 | VZEROUPPER_RETURN |
634 | |
635 | # ifdef USE_AS_STRNCMP |
636 | .p2align 4,, 2 |
637 | L(ret_zero_end): |
638 | xorl %eax, %eax |
639 | VZEROUPPER_RETURN |
640 | # endif |
641 | |
642 | |
/* The L(return_vec_N_end) labels differ from L(return_vec_N) in that
they use the value of `r8` to negate the return value. This is
because the page cross logic can swap `rdi` and `rsi`. */
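
/* For the strcmp/strncmp (byte) case r8 is either 0 (rdi/rsi not
swapped) or -1 (swapped), and the `xor r8' / `sub r8' pair used in
these return paths is a branchless conditional negate, roughly
(illustrative C only):

	ret = (ret ^ r8) - r8;	// identity for r8 == 0, -ret for r8 == -1

For wcscmp, r8 is 2 or -4 and only the xor is applied, which is enough
to give the 0/-1 compare result the correct sign while keeping it
non-zero. */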
646 | .p2align 4,, 10 |
647 | # ifdef USE_AS_STRNCMP |
648 | L(return_vec_1_end): |
649 | salq $32, %rcx |
650 | # endif |
651 | L(return_vec_0_end): |
652 | # ifndef USE_AS_STRNCMP |
653 | tzcntl %ecx, %ecx |
654 | # else |
655 | tzcntq %rcx, %rcx |
656 | cmpq %rcx, %rdx |
657 | jbe L(ret_zero_end) |
658 | # endif |
659 | |
660 | # ifdef USE_AS_WCSCMP |
661 | movl (%rdi, %rcx), %edx |
662 | xorl %eax, %eax |
663 | cmpl (%rsi, %rcx), %edx |
664 | je L(ret6) |
665 | setl %al |
666 | negl %eax |
667 | xorl %r8d, %eax |
668 | # else |
669 | movzbl (%rdi, %rcx), %eax |
670 | movzbl (%rsi, %rcx), %ecx |
671 | TOLOWER_gpr (%rax, %eax) |
672 | TOLOWER_gpr (%rcx, %ecx) |
673 | subl %ecx, %eax |
674 | xorl %r8d, %eax |
675 | subl %r8d, %eax |
676 | # endif |
677 | L(ret6): |
678 | VZEROUPPER_RETURN |
679 | |
680 | # ifndef USE_AS_STRNCMP |
681 | .p2align 4,, 10 |
682 | L(return_vec_1_end): |
683 | tzcntl %ecx, %ecx |
684 | # ifdef USE_AS_WCSCMP |
685 | movl VEC_SIZE(%rdi, %rcx), %edx |
686 | xorl %eax, %eax |
687 | cmpl VEC_SIZE(%rsi, %rcx), %edx |
688 | je L(ret7) |
689 | setl %al |
690 | negl %eax |
691 | xorl %r8d, %eax |
692 | # else |
693 | movzbl VEC_SIZE(%rdi, %rcx), %eax |
694 | movzbl VEC_SIZE(%rsi, %rcx), %ecx |
695 | TOLOWER_gpr (%rax, %eax) |
696 | TOLOWER_gpr (%rcx, %ecx) |
697 | subl %ecx, %eax |
698 | xorl %r8d, %eax |
699 | subl %r8d, %eax |
700 | # endif |
701 | L(ret7): |
702 | VZEROUPPER_RETURN |
703 | # endif |
704 | |
705 | .p2align 4,, 10 |
706 | L(return_vec_2_end): |
707 | tzcntl %ecx, %ecx |
708 | # ifdef USE_AS_STRNCMP |
709 | cmpq %rcx, %rdx |
710 | jbe L(ret_zero_page_cross) |
711 | # endif |
712 | # ifdef USE_AS_WCSCMP |
713 | movl (VEC_SIZE * 2)(%rdi, %rcx), %edx |
714 | xorl %eax, %eax |
715 | cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx |
716 | je L(ret11) |
717 | setl %al |
718 | negl %eax |
719 | xorl %r8d, %eax |
720 | # else |
721 | movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax |
722 | movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx |
723 | TOLOWER_gpr (%rax, %eax) |
724 | TOLOWER_gpr (%rcx, %ecx) |
725 | subl %ecx, %eax |
726 | xorl %r8d, %eax |
727 | subl %r8d, %eax |
728 | # endif |
729 | L(ret11): |
730 | VZEROUPPER_RETURN |
731 | |
732 | |
733 | /* Page cross in rsi in next 4x VEC. */ |
734 | |
735 | /* TODO: Improve logic here. */ |
736 | .p2align 4,, 10 |
737 | L(page_cross_during_loop): |
738 | /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ |
739 | |
/* Optimistically rsi and rdi are both aligned, in which case we
don't need any extra logic here. */
742 | cmpl $-(VEC_SIZE * 4), %eax |
/* We don't adjust eax before jumping back to the loop, and we will
never hit the page cross case again. */
745 | je L(loop_skip_page_cross_check) |
746 | |
747 | /* Check if we can safely load a VEC. */ |
748 | cmpl $-(VEC_SIZE * 3), %eax |
749 | jle L(less_1x_vec_till_page_cross) |
750 | |
751 | VMOVA (%rdi), %ymm0 |
752 | CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) |
753 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
754 | vpandn %ymm1, %ymm2, %ymm1 |
755 | vpmovmskb %ymm1, %ecx |
756 | incl %ecx |
757 | jnz L(return_vec_0_end) |
758 | |
759 | /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ |
760 | cmpl $-(VEC_SIZE * 2), %eax |
761 | jg L(more_2x_vec_till_page_cross) |
762 | |
763 | .p2align 4,, 4 |
764 | L(less_1x_vec_till_page_cross): |
765 | subl $-(VEC_SIZE * 4), %eax |
/* Guaranteed to be safe to read from rdi - VEC_SIZE here. The only
concerning case is the first iteration, if the incoming s1 was near
the start of a page and s2 near the end. If s1 was near the start of
the page we already aligned to the nearest VEC_SIZE * 4, so it is
guaranteed to be safe to read back -VEC_SIZE. If rdi is truly at the
start of a page here, it means the previous page (rdi - VEC_SIZE) has
already been loaded earlier, so it must be valid. */
773 | VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 |
774 | CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1) |
775 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
776 | vpandn %ymm1, %ymm2, %ymm1 |
777 | vpmovmskb %ymm1, %ecx |
778 | |
/* Mask of potentially valid bits. The lower bits can come from
out-of-range comparisons (but they are safe regarding page crosses). */
781 | movl $-1, %r10d |
782 | shlxl %esi, %r10d, %r10d |
783 | notl %ecx |
784 | |
785 | # ifdef USE_AS_STRNCMP |
786 | cmpq %rax, %rdx |
787 | jbe L(return_page_cross_end_check) |
788 | # endif |
789 | movl %eax, %OFFSET_REG |
790 | addl $(PAGE_SIZE - VEC_SIZE * 4), %eax |
791 | |
792 | andl %r10d, %ecx |
793 | jz L(loop_skip_page_cross_check) |
794 | |
795 | .p2align 4,, 3 |
796 | L(return_page_cross_end): |
797 | tzcntl %ecx, %ecx |
798 | |
799 | # ifdef USE_AS_STRNCMP |
800 | leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx |
801 | L(return_page_cross_cmp_mem): |
802 | # else |
803 | addl %OFFSET_REG, %ecx |
804 | # endif |
805 | # ifdef USE_AS_WCSCMP |
806 | movl VEC_OFFSET(%rdi, %rcx), %edx |
807 | xorl %eax, %eax |
808 | cmpl VEC_OFFSET(%rsi, %rcx), %edx |
809 | je L(ret8) |
810 | setl %al |
811 | negl %eax |
812 | xorl %r8d, %eax |
813 | # else |
814 | movzbl VEC_OFFSET(%rdi, %rcx), %eax |
815 | movzbl VEC_OFFSET(%rsi, %rcx), %ecx |
816 | TOLOWER_gpr (%rax, %eax) |
817 | TOLOWER_gpr (%rcx, %ecx) |
818 | subl %ecx, %eax |
819 | xorl %r8d, %eax |
820 | subl %r8d, %eax |
821 | # endif |
822 | L(ret8): |
823 | VZEROUPPER_RETURN |
824 | |
825 | # ifdef USE_AS_STRNCMP |
826 | .p2align 4,, 10 |
827 | L(return_page_cross_end_check): |
828 | andl %r10d, %ecx |
829 | tzcntl %ecx, %ecx |
830 | leal -VEC_SIZE(%rax, %rcx), %ecx |
831 | cmpl %ecx, %edx |
832 | ja L(return_page_cross_cmp_mem) |
833 | xorl %eax, %eax |
834 | VZEROUPPER_RETURN |
835 | # endif |
836 | |
837 | |
838 | .p2align 4,, 10 |
839 | L(more_2x_vec_till_page_cross): |
/* If there is more than 2x VEC until the page cross, we will
complete a full loop iteration here. */
842 | |
843 | VMOVU VEC_SIZE(%rdi), %ymm0 |
844 | CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) |
845 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
846 | vpandn %ymm1, %ymm2, %ymm1 |
847 | vpmovmskb %ymm1, %ecx |
848 | incl %ecx |
849 | jnz L(return_vec_1_end) |
850 | |
851 | # ifdef USE_AS_STRNCMP |
852 | cmpq $(VEC_SIZE * 2), %rdx |
853 | jbe L(ret_zero_in_loop_page_cross) |
854 | # endif |
855 | |
856 | subl $-(VEC_SIZE * 4), %eax |
857 | |
858 | /* Safe to include comparisons from lower bytes. */ |
859 | VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 |
860 | CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1) |
861 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
862 | vpandn %ymm1, %ymm2, %ymm1 |
863 | vpmovmskb %ymm1, %ecx |
864 | incl %ecx |
865 | jnz L(return_vec_page_cross_0) |
866 | |
867 | VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 |
868 | CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1) |
869 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
870 | vpandn %ymm1, %ymm2, %ymm1 |
871 | vpmovmskb %ymm1, %ecx |
872 | incl %ecx |
873 | jnz L(return_vec_page_cross_1) |
874 | |
875 | # ifdef USE_AS_STRNCMP |
/* Must check the length here, as the length might preclude reading
the next page. */
878 | cmpq %rax, %rdx |
879 | jbe L(ret_zero_in_loop_page_cross) |
880 | # endif |
881 | |
882 | /* Finish the loop. */ |
883 | VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 |
884 | VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 |
885 | |
886 | CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) |
887 | CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) |
888 | vpand %ymm4, %ymm5, %ymm5 |
889 | vpand %ymm6, %ymm7, %ymm7 |
890 | VPMINU %ymm5, %ymm7, %ymm7 |
891 | VPCMPEQ %ymm7, %ymmZERO, %ymm7 |
892 | vpmovmskb %ymm7, %LOOP_REG |
893 | testl %LOOP_REG, %LOOP_REG |
894 | jnz L(return_vec_2_3_end) |
895 | |
/* Best for code size to use an unconditional jump here. If this case
were hot it would be faster to duplicate the L(return_vec_2_3_end)
code as the fall-through and jump back to the loop on the mismatch
comparison. */
900 | subq $-(VEC_SIZE * 4), %rdi |
901 | subq $-(VEC_SIZE * 4), %rsi |
902 | addl $(PAGE_SIZE - VEC_SIZE * 8), %eax |
903 | # ifdef USE_AS_STRNCMP |
904 | subq $(VEC_SIZE * 4), %rdx |
905 | ja L(loop_skip_page_cross_check) |
906 | L(ret_zero_in_loop_page_cross): |
907 | xorl %eax, %eax |
908 | VZEROUPPER_RETURN |
909 | # else |
910 | jmp L(loop_skip_page_cross_check) |
911 | # endif |
912 | |
913 | |
914 | .p2align 4,, 10 |
915 | L(return_vec_page_cross_0): |
916 | addl $-VEC_SIZE, %eax |
917 | L(return_vec_page_cross_1): |
918 | tzcntl %ecx, %ecx |
919 | # ifdef USE_AS_STRNCMP |
920 | leal -VEC_SIZE(%rax, %rcx), %ecx |
921 | cmpq %rcx, %rdx |
922 | jbe L(ret_zero_in_loop_page_cross) |
923 | # else |
924 | addl %eax, %ecx |
925 | # endif |
926 | |
927 | # ifdef USE_AS_WCSCMP |
928 | movl VEC_OFFSET(%rdi, %rcx), %edx |
929 | xorl %eax, %eax |
930 | cmpl VEC_OFFSET(%rsi, %rcx), %edx |
931 | je L(ret9) |
932 | setl %al |
933 | negl %eax |
934 | xorl %r8d, %eax |
935 | # else |
936 | movzbl VEC_OFFSET(%rdi, %rcx), %eax |
937 | movzbl VEC_OFFSET(%rsi, %rcx), %ecx |
938 | TOLOWER_gpr (%rax, %eax) |
939 | TOLOWER_gpr (%rcx, %ecx) |
940 | subl %ecx, %eax |
941 | xorl %r8d, %eax |
942 | subl %r8d, %eax |
943 | # endif |
944 | L(ret9): |
945 | VZEROUPPER_RETURN |
946 | |
947 | |
948 | .p2align 4,, 10 |
949 | L(page_cross): |
950 | # ifndef USE_AS_STRNCMP |
/* If both are VEC aligned we don't need any special logic here.
This is only valid for strcmp, where the stop condition is guaranteed
to be reachable by just reading memory. */
954 | testl $((VEC_SIZE - 1) << 20), %eax |
955 | jz L(no_page_cross) |
956 | # endif |
957 | |
958 | movl %edi, %eax |
959 | movl %esi, %ecx |
960 | andl $(PAGE_SIZE - 1), %eax |
961 | andl $(PAGE_SIZE - 1), %ecx |
962 | |
963 | xorl %OFFSET_REG, %OFFSET_REG |
964 | |
965 | /* Check which is closer to page cross, s1 or s2. */ |
966 | cmpl %eax, %ecx |
967 | jg L(page_cross_s2) |
968 | |
/* The previous page cross check has false positives. Check for a
true positive, as the page cross logic is very expensive. */
971 | subl $(PAGE_SIZE - VEC_SIZE * 4), %eax |
972 | jbe L(no_page_cross) |
973 | |
/* Set r8 so that it does not interfere with the normal return value
(rdi and rsi were not swapped). */
976 | # ifdef USE_AS_WCSCMP |
/* Any non-zero positive value that doesn't interfere with 0x1. */
979 | movl $2, %r8d |
980 | # else |
981 | xorl %r8d, %r8d |
982 | # endif |
983 | |
984 | /* Check if less than 1x VEC till page cross. */ |
985 | subl $(VEC_SIZE * 3), %eax |
986 | jg L(less_1x_vec_till_page) |
987 | |
/* If more than 1x VEC until the page cross, loop through safely
loadable memory until within 1x VEC of the page cross. */
990 | |
991 | .p2align 4,, 10 |
992 | L(page_cross_loop): |
993 | |
994 | VMOVU (%rdi, %OFFSET_REG64), %ymm0 |
995 | CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) |
996 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
997 | vpandn %ymm1, %ymm2, %ymm1 |
998 | vpmovmskb %ymm1, %ecx |
999 | incl %ecx |
1000 | |
1001 | jnz L(check_ret_vec_page_cross) |
1002 | addl $VEC_SIZE, %OFFSET_REG |
1003 | # ifdef USE_AS_STRNCMP |
1004 | cmpq %OFFSET_REG64, %rdx |
1005 | jbe L(ret_zero_page_cross) |
1006 | # endif |
1007 | addl $VEC_SIZE, %eax |
1008 | jl L(page_cross_loop) |
1009 | |
1010 | subl %eax, %OFFSET_REG |
/* OFFSET_REG has the distance to the page cross minus VEC_SIZE.
This is guaranteed not to cross the page, so it is safe to load.
Since we have already loaded at least 1 VEC from rsi it is also
guaranteed to be safe. */
1015 | |
1016 | VMOVU (%rdi, %OFFSET_REG64), %ymm0 |
1017 | CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) |
1018 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
1019 | vpandn %ymm1, %ymm2, %ymm1 |
1020 | vpmovmskb %ymm1, %ecx |
1021 | |
1022 | # ifdef USE_AS_STRNCMP |
1023 | leal VEC_SIZE(%OFFSET_REG64), %eax |
1024 | cmpq %rax, %rdx |
1025 | jbe L(check_ret_vec_page_cross2) |
1026 | addq %rdi, %rdx |
1027 | # endif |
1028 | incl %ecx |
1029 | jz L(prepare_loop_no_len) |
1030 | |
1031 | .p2align 4,, 4 |
1032 | L(ret_vec_page_cross): |
1033 | # ifndef USE_AS_STRNCMP |
1034 | L(check_ret_vec_page_cross): |
1035 | # endif |
1036 | tzcntl %ecx, %ecx |
1037 | addl %OFFSET_REG, %ecx |
1038 | L(ret_vec_page_cross_cont): |
1039 | # ifdef USE_AS_WCSCMP |
1040 | movl (%rdi, %rcx), %edx |
1041 | xorl %eax, %eax |
1042 | cmpl (%rsi, %rcx), %edx |
1043 | je L(ret12) |
1044 | setl %al |
1045 | negl %eax |
1046 | xorl %r8d, %eax |
1047 | # else |
1048 | movzbl (%rdi, %rcx), %eax |
1049 | movzbl (%rsi, %rcx), %ecx |
1050 | TOLOWER_gpr (%rax, %eax) |
1051 | TOLOWER_gpr (%rcx, %ecx) |
1052 | subl %ecx, %eax |
1053 | xorl %r8d, %eax |
1054 | subl %r8d, %eax |
1055 | # endif |
1056 | L(ret12): |
1057 | VZEROUPPER_RETURN |
1058 | |
1059 | # ifdef USE_AS_STRNCMP |
1060 | .p2align 4,, 10 |
1061 | L(check_ret_vec_page_cross2): |
1062 | incl %ecx |
1063 | L(check_ret_vec_page_cross): |
1064 | tzcntl %ecx, %ecx |
1065 | addl %OFFSET_REG, %ecx |
1066 | cmpq %rcx, %rdx |
1067 | ja L(ret_vec_page_cross_cont) |
1068 | .p2align 4,, 2 |
1069 | L(ret_zero_page_cross): |
1070 | xorl %eax, %eax |
1071 | VZEROUPPER_RETURN |
1072 | # endif |
1073 | |
1074 | .p2align 4,, 4 |
1075 | L(page_cross_s2): |
1076 | /* Ensure this is a true page cross. */ |
1077 | subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx |
1078 | jbe L(no_page_cross) |
1079 | |
1080 | |
1081 | movl %ecx, %eax |
1082 | movq %rdi, %rcx |
1083 | movq %rsi, %rdi |
1084 | movq %rcx, %rsi |
1085 | |
/* Set r8 to negate the return value, as rdi and rsi have been swapped. */
1087 | # ifdef USE_AS_WCSCMP |
1088 | movl $-4, %r8d |
1089 | # else |
1090 | movl $-1, %r8d |
1091 | # endif |
1092 | xorl %OFFSET_REG, %OFFSET_REG |
1093 | |
1094 | /* Check if more than 1x VEC till page cross. */ |
1095 | subl $(VEC_SIZE * 3), %eax |
1096 | jle L(page_cross_loop) |
1097 | |
1098 | .p2align 4,, 6 |
1099 | L(less_1x_vec_till_page): |
1100 | /* Find largest load size we can use. */ |
1101 | cmpl $16, %eax |
1102 | ja L(less_16_till_page) |
1103 | |
1104 | VMOVU (%rdi), %xmm0 |
1105 | CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1) |
1106 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1107 | vpandn %xmm1, %xmm2, %xmm1 |
1108 | vpmovmskb %ymm1, %ecx |
1109 | incw %cx |
1110 | jnz L(check_ret_vec_page_cross) |
1111 | movl $16, %OFFSET_REG |
1112 | # ifdef USE_AS_STRNCMP |
1113 | cmpq %OFFSET_REG64, %rdx |
1114 | jbe L(ret_zero_page_cross_slow_case0) |
1115 | subl %eax, %OFFSET_REG |
1116 | # else |
1117 | /* Explicit check for 16 byte alignment. */ |
1118 | subl %eax, %OFFSET_REG |
1119 | jz L(prepare_loop) |
1120 | # endif |
1121 | |
1122 | VMOVU (%rdi, %OFFSET_REG64), %xmm0 |
1123 | CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1) |
1124 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1125 | vpandn %xmm1, %xmm2, %xmm1 |
1126 | vpmovmskb %ymm1, %ecx |
1127 | incw %cx |
1128 | jnz L(check_ret_vec_page_cross) |
1129 | |
1130 | # ifdef USE_AS_STRNCMP |
1131 | addl $16, %OFFSET_REG |
1132 | subq %OFFSET_REG64, %rdx |
1133 | jbe L(ret_zero_page_cross_slow_case0) |
1134 | subq $-(VEC_SIZE * 4), %rdx |
1135 | |
1136 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1137 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1138 | # else |
1139 | leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1140 | leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1141 | # endif |
1142 | jmp L(prepare_loop_aligned) |
1143 | |
1144 | # ifdef USE_AS_STRNCMP |
1145 | .p2align 4,, 2 |
1146 | L(ret_zero_page_cross_slow_case0): |
1147 | xorl %eax, %eax |
1148 | ret |
1149 | # endif |
1150 | |
1151 | |
1152 | .p2align 4,, 10 |
1153 | L(less_16_till_page): |
1154 | /* Find largest load size we can use. */ |
1155 | cmpl $24, %eax |
1156 | ja L(less_8_till_page) |
1157 | |
1158 | vmovq (%rdi), %xmm0 |
1159 | vmovq (%rsi), %xmm1 |
1160 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1161 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1162 | vpandn %xmm1, %xmm2, %xmm1 |
1163 | vpmovmskb %ymm1, %ecx |
1164 | incb %cl |
1165 | jnz L(check_ret_vec_page_cross) |
1166 | |
1167 | |
1168 | # ifdef USE_AS_STRNCMP |
1169 | cmpq $8, %rdx |
1170 | jbe L(ret_zero_page_cross_slow_case0) |
1171 | # endif |
1172 | movl $24, %OFFSET_REG |
1173 | /* Explicit check for 16 byte alignment. */ |
1174 | subl %eax, %OFFSET_REG |
1175 | |
1176 | |
1177 | |
1178 | vmovq (%rdi, %OFFSET_REG64), %xmm0 |
1179 | vmovq (%rsi, %OFFSET_REG64), %xmm1 |
1180 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1181 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1182 | vpandn %xmm1, %xmm2, %xmm1 |
1183 | vpmovmskb %ymm1, %ecx |
1184 | incb %cl |
1185 | jnz L(check_ret_vec_page_cross) |
1186 | |
1187 | # ifdef USE_AS_STRNCMP |
1188 | addl $8, %OFFSET_REG |
1189 | subq %OFFSET_REG64, %rdx |
1190 | jbe L(ret_zero_page_cross_slow_case0) |
1191 | subq $-(VEC_SIZE * 4), %rdx |
1192 | |
1193 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1194 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1195 | # else |
1196 | leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1197 | leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1198 | # endif |
1199 | jmp L(prepare_loop_aligned) |
1200 | |
1201 | |
1202 | .p2align 4,, 10 |
1203 | L(less_8_till_page): |
1204 | # ifdef USE_AS_WCSCMP |
1205 | /* If using wchar then this is the only check before we reach |
1206 | the page boundary. */ |
1207 | movl (%rdi), %eax |
1208 | movl (%rsi), %ecx |
1209 | cmpl %ecx, %eax |
1210 | jnz L(ret_less_8_wcs) |
1211 | # ifdef USE_AS_STRNCMP |
1212 | addq %rdi, %rdx |
1213 | /* We already checked for len <= 1 so cannot hit that case here. |
1214 | */ |
1215 | # endif |
1216 | testl %eax, %eax |
1217 | jnz L(prepare_loop_no_len) |
1218 | ret |
1219 | |
1220 | .p2align 4,, 8 |
1221 | L(ret_less_8_wcs): |
1222 | setl %OFFSET_REG8 |
1223 | negl %OFFSET_REG |
1224 | movl %OFFSET_REG, %eax |
1225 | xorl %r8d, %eax |
1226 | ret |
1227 | |
1228 | # else |
1229 | |
1230 | /* Find largest load size we can use. */ |
1231 | cmpl $28, %eax |
1232 | ja L(less_4_till_page) |
1233 | |
1234 | vmovd (%rdi), %xmm0 |
1235 | vmovd (%rsi), %xmm1 |
1236 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1237 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1238 | vpandn %xmm1, %xmm2, %xmm1 |
1239 | vpmovmskb %ymm1, %ecx |
1240 | subl $0xf, %ecx |
1241 | jnz L(check_ret_vec_page_cross) |
1242 | |
1243 | # ifdef USE_AS_STRNCMP |
1244 | cmpq $4, %rdx |
1245 | jbe L(ret_zero_page_cross_slow_case1) |
1246 | # endif |
1247 | movl $28, %OFFSET_REG |
1248 | /* Explicit check for 16 byte alignment. */ |
1249 | subl %eax, %OFFSET_REG |
1250 | |
1251 | |
1252 | |
1253 | vmovd (%rdi, %OFFSET_REG64), %xmm0 |
1254 | vmovd (%rsi, %OFFSET_REG64), %xmm1 |
1255 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1256 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1257 | vpandn %xmm1, %xmm2, %xmm1 |
1258 | vpmovmskb %ymm1, %ecx |
1259 | subl $0xf, %ecx |
1260 | jnz L(check_ret_vec_page_cross) |
1261 | |
1262 | # ifdef USE_AS_STRNCMP |
1263 | addl $4, %OFFSET_REG |
1264 | subq %OFFSET_REG64, %rdx |
1265 | jbe L(ret_zero_page_cross_slow_case1) |
1266 | subq $-(VEC_SIZE * 4), %rdx |
1267 | |
1268 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1269 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1270 | # else |
1271 | leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1272 | leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1273 | # endif |
1274 | jmp L(prepare_loop_aligned) |
1275 | |
1276 | # ifdef USE_AS_STRNCMP |
1277 | .p2align 4,, 2 |
1278 | L(ret_zero_page_cross_slow_case1): |
1279 | xorl %eax, %eax |
1280 | ret |
1281 | # endif |
1282 | |
1283 | .p2align 4,, 10 |
1284 | L(less_4_till_page): |
1285 | subq %rdi, %rsi |
1286 | /* Extremely slow byte comparison loop. */ |
1287 | L(less_4_loop): |
1288 | movzbl (%rdi), %eax |
1289 | movzbl (%rsi, %rdi), %ecx |
1290 | TOLOWER_gpr (%rax, %eax) |
1291 | TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) |
1292 | subl %BYTE_LOOP_REG, %eax |
1293 | jnz L(ret_less_4_loop) |
1294 | testl %ecx, %ecx |
1295 | jz L(ret_zero_4_loop) |
1296 | # ifdef USE_AS_STRNCMP |
1297 | decq %rdx |
1298 | jz L(ret_zero_4_loop) |
1299 | # endif |
1300 | incq %rdi |
/* The end condition is reaching the page boundary (rdi is aligned). */
1302 | testl $31, %edi |
1303 | jnz L(less_4_loop) |
1304 | leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi |
1305 | addq $-(VEC_SIZE * 4), %rdi |
1306 | # ifdef USE_AS_STRNCMP |
1307 | subq $-(VEC_SIZE * 4), %rdx |
1308 | # endif |
1309 | jmp L(prepare_loop_aligned) |
1310 | |
1311 | L(ret_zero_4_loop): |
1312 | xorl %eax, %eax |
1313 | ret |
1314 | L(ret_less_4_loop): |
1315 | xorl %r8d, %eax |
1316 | subl %r8d, %eax |
1317 | ret |
1318 | # endif |
1319 | cfi_endproc |
1320 | .size STRCMP, .-STRCMP |
1321 | #endif |
1322 | |