memcmpeq-avx2.S source code [glibc/sysdeps/x86_64/multiarch/memcmpeq-avx2.S]

1	/ __memcmpeq optimized with AVX2.*
2	Copyright (C) 2017-2024 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <isa-level.h>
20
21	#if ISA_SHOULD_BUILD (3)
22
23	/ __memcmpeq is implemented as:*
24	1. Use ymm vector compares when possible. The only case where
25	vector compares is not possible for when size < VEC_SIZE
26	and loading from either s1 or s2 would cause a page cross.
27	2. Use xmm vector compare when size >= 8 bytes.
28	3. Optimistically compare up to first 4 VEC_SIZE one at a*
29	to check for early mismatches. Only do this if its guaranteed the
30	work is not wasted.
31	4. If size is 8 VEC_SIZE or less, unroll the loop.*
32	5. Compare 4 VEC_SIZE at a time with the aligned first memory*
33	area.
34	6. Use 2 vector compares when size is 2 VEC_SIZE or less.*
35	7. Use 4 vector compares when size is 4 VEC_SIZE or less.*
36	8. Use 8 vector compares when size is 8 VEC_SIZE or less. /
37
38	# include <sysdep.h>
39
40	# ifndef MEMCMPEQ
41	# define MEMCMPEQ __memcmpeq_avx2
42	# endif
43
44	# define VPCMPEQ vpcmpeqb
45
46	# ifndef VZEROUPPER
47	# define VZEROUPPER vzeroupper
48	# endif
49
50	# ifndef SECTION
51	# define SECTION(p) p##.avx
52	# endif
53
54	# define VEC_SIZE 32
55	# define PAGE_SIZE 4096
56
57	.section SECTION(.text), "ax", @progbits
58	ENTRY_P2ALIGN (MEMCMPEQ, `6`)
59	# ifdef __ILP32__
60	/ Clear the upper 32 bits. /
61	movl %edx, %edx
62	# endif
63	cmp $VEC_SIZE, %RDX_LP
64	jb L(less_vec)
65
66	/ From VEC to 2 * VEC. No branch when size == VEC_SIZE. /
67	vmovdqu (%rsi), %ymm1
68	VPCMPEQ (%rdi), %ymm1, %ymm1
69	vpmovmskb %ymm1, %eax
70	incl %eax
71	jnz L(return_neq0)
72	cmpq $(VEC_SIZE * `2`), %rdx
73	jbe L(last_1x_vec)
74
75	/ Check second VEC no matter what. /
76	vmovdqu VEC_SIZE(%rsi), %ymm2
77	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
78	vpmovmskb %ymm2, %eax
79	/ If all 4 VEC where equal eax will be all 1s so incl will overflow*
80	and set zero flag. /*
81	incl %eax
82	jnz L(return_neq0)
83
84	/ Less than 4 * VEC. /
85	cmpq $(VEC_SIZE * `4`), %rdx
86	jbe L(last_2x_vec)
87
88	/ Check third and fourth VEC no matter what. /
89	vmovdqu (VEC_SIZE * `2`)(%rsi), %ymm3
90	VPCMPEQ (VEC_SIZE * `2`)(%rdi), %ymm3, %ymm3
91	vpmovmskb %ymm3, %eax
92	incl %eax
93	jnz L(return_neq0)
94
95	vmovdqu (VEC_SIZE * `3`)(%rsi), %ymm4
96	VPCMPEQ (VEC_SIZE * `3`)(%rdi), %ymm4, %ymm4
97	vpmovmskb %ymm4, %eax
98	incl %eax
99	jnz L(return_neq0)
100
101	/ Go to 4x VEC loop. /
102	cmpq $(VEC_SIZE * `8`), %rdx
103	ja L(more_8x_vec)
104
105	/ Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any*
106	branches. /*
107
108	/ Adjust rsi and rdi to avoid indexed address mode. This end up*
109	saving a 16 bytes of code, prevents unlamination, and bottlenecks in
110	the AGU. /*
111	addq %rdx, %rsi
112	vmovdqu -(VEC_SIZE * `4`)(%rsi), %ymm1
113	vmovdqu -(VEC_SIZE * `3`)(%rsi), %ymm2
114	addq %rdx, %rdi
115
116	VPCMPEQ -(VEC_SIZE * `4`)(%rdi), %ymm1, %ymm1
117	VPCMPEQ -(VEC_SIZE * `3`)(%rdi), %ymm2, %ymm2
118
119	vmovdqu -(VEC_SIZE * `2`)(%rsi), %ymm3
120	VPCMPEQ -(VEC_SIZE * `2`)(%rdi), %ymm3, %ymm3
121	vmovdqu -VEC_SIZE(%rsi), %ymm4
122	VPCMPEQ -VEC_SIZE(%rdi), %ymm4, %ymm4
123
124	/ Reduce VEC0 - VEC4. /
125	vpand %ymm1, %ymm2, %ymm2
126	vpand %ymm3, %ymm4, %ymm4
127	vpand %ymm2, %ymm4, %ymm4
128	vpmovmskb %ymm4, %eax
129	incl %eax
130	L(return_neq0):
131	L(return_vzeroupper):
132	ZERO_UPPER_VEC_REGISTERS_RETURN
133
134	/ NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32 byte*
135	aligned. /*
136	.p2align `5`
137	L(less_vec):
138	/ Check if one or less char. This is necessary for size = 0 but is*
139	also faster for size = 1. /*
140	cmpl $`1`, %edx
141	jbe L(one_or_less)
142
143	/ Check if loading one VEC from either s1 or s2 could cause a page*
144	cross. This can have false positives but is by far the fastest
145	method. /*
146	movl %edi, %eax
147	orl %esi, %eax
148	andl $(PAGE_SIZE - `1`), %eax
149	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
150	jg L(page_cross_less_vec)
151
152	/ No page cross possible. /
153	vmovdqu (%rsi), %ymm2
154	VPCMPEQ (%rdi), %ymm2, %ymm2
155	vpmovmskb %ymm2, %eax
156	incl %eax
157	/ Result will be zero if s1 and s2 match. Otherwise first set bit*
158	will be first mismatch. /*
159	bzhil %edx, %eax, %eax
160	VZEROUPPER_RETURN
161
162	/ Relatively cold but placing close to L(less_vec) for 2 byte jump*
163	encoding. /*
164	.p2align `4`
165	L(one_or_less):
166	jb L(zero)
167	movzbl (%rsi), %ecx
168	movzbl (%rdi), %eax
169	subl %ecx, %eax
170	/ No ymm register was touched. /
171	ret
172	/ Within the same 16 byte block is L(one_or_less). /
173	L(zero):
174	xorl %eax, %eax
175	ret
176
177	.p2align `4`
178	L(last_1x_vec):
179	vmovdqu -(VEC_SIZE * `1`)(%rsi, %rdx), %ymm1
180	VPCMPEQ -(VEC_SIZE * `1`)(%rdi, %rdx), %ymm1, %ymm1
181	vpmovmskb %ymm1, %eax
182	incl %eax
183	VZEROUPPER_RETURN
184
185	.p2align `4`
186	L(last_2x_vec):
187	vmovdqu -(VEC_SIZE * `2`)(%rsi, %rdx), %ymm1
188	VPCMPEQ -(VEC_SIZE * `2`)(%rdi, %rdx), %ymm1, %ymm1
189	vmovdqu -(VEC_SIZE * `1`)(%rsi, %rdx), %ymm2
190	VPCMPEQ -(VEC_SIZE * `1`)(%rdi, %rdx), %ymm2, %ymm2
191	vpand %ymm1, %ymm2, %ymm2
192	vpmovmskb %ymm2, %eax
193	incl %eax
194	VZEROUPPER_RETURN
195
196	.p2align `4`
197	L(more_8x_vec):
198	/ Set end of s1 in rdx. /
199	leaq -(VEC_SIZE * `4`)(%rdi, %rdx), %rdx
200	/ rsi stores s2 - s1. This allows loop to only update one pointer.*
201	*/
202	subq %rdi, %rsi
203	/ Align s1 pointer. /
204	andq $-VEC_SIZE, %rdi
205	/ Adjust because first 4x vec where check already. /
206	subq $-(VEC_SIZE * `4`), %rdi
207	.p2align `4`
208	L(loop_4x_vec):
209	/ rsi has s2 - s1 so get correct address by adding s1 (in rdi). /
210	vmovdqu (%rsi, %rdi), %ymm1
211	VPCMPEQ (%rdi), %ymm1, %ymm1
212
213	vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2
214	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
215
216	vmovdqu (VEC_SIZE * `2`)(%rsi, %rdi), %ymm3
217	VPCMPEQ (VEC_SIZE * `2`)(%rdi), %ymm3, %ymm3
218
219	vmovdqu (VEC_SIZE * `3`)(%rsi, %rdi), %ymm4
220	VPCMPEQ (VEC_SIZE * `3`)(%rdi), %ymm4, %ymm4
221
222	vpand %ymm1, %ymm2, %ymm2
223	vpand %ymm3, %ymm4, %ymm4
224	vpand %ymm2, %ymm4, %ymm4
225	vpmovmskb %ymm4, %eax
226	incl %eax
227	jnz L(return_neq1)
228	subq $-(VEC_SIZE * `4`), %rdi
229	/ Check if s1 pointer at end. /
230	cmpq %rdx, %rdi
231	jb L(loop_4x_vec)
232
233	vmovdqu (VEC_SIZE * `3`)(%rsi, %rdx), %ymm4
234	VPCMPEQ (VEC_SIZE * `3`)(%rdx), %ymm4, %ymm4
235	subq %rdx, %rdi
236	/ rdi has 4 * VEC_SIZE - remaining length. /
237	cmpl $(VEC_SIZE * `3`), %edi
238	jae L(`8x_last_1x_vec`)
239	/ Load regardless of branch. /
240	vmovdqu (VEC_SIZE * `2`)(%rsi, %rdx), %ymm3
241	VPCMPEQ (VEC_SIZE * `2`)(%rdx), %ymm3, %ymm3
242	cmpl $(VEC_SIZE * `2`), %edi
243	jae L(`8x_last_2x_vec`)
244	/ Check last 4 VEC. /
245	vmovdqu VEC_SIZE(%rsi, %rdx), %ymm1
246	VPCMPEQ VEC_SIZE(%rdx), %ymm1, %ymm1
247
248	vmovdqu (%rsi, %rdx), %ymm2
249	VPCMPEQ (%rdx), %ymm2, %ymm2
250
251	vpand %ymm3, %ymm4, %ymm4
252	vpand %ymm1, %ymm2, %ymm3
253	L(`8x_last_2x_vec`):
254	vpand %ymm3, %ymm4, %ymm4
255	L(`8x_last_1x_vec`):
256	vpmovmskb %ymm4, %eax
257	/ Restore s1 pointer to rdi. /
258	incl %eax
259	L(return_neq1):
260	VZEROUPPER_RETURN
261
262	/ Relatively cold case as page cross are unexpected. /
263	.p2align `4`
264	L(page_cross_less_vec):
265	cmpl $`16`, %edx
266	jae L(between_16_31)
267	cmpl $`8`, %edx
268	ja L(between_9_15)
269	cmpl $`4`, %edx
270	jb L(between_2_3)
271	/ From 4 to 8 bytes. No branch when size == 4. /
272	movl (%rdi), %eax
273	subl (%rsi), %eax
274	movl -`4`(%rdi, %rdx), %ecx
275	movl -`4`(%rsi, %rdx), %edi
276	subl %edi, %ecx
277	orl %ecx, %eax
278	ret
279
280	.p2align `4`,, `8`
281	L(between_16_31):
282	/ From 16 to 31 bytes. No branch when size == 16. /
283
284	/ Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe.*
285	*/
286	vmovdqu (%rsi), %xmm1
287	vpcmpeqb (%rdi), %xmm1, %xmm1
288	vmovdqu -`16`(%rsi, %rdx), %xmm2
289	vpcmpeqb -`16`(%rdi, %rdx), %xmm2, %xmm2
290	vpand %xmm1, %xmm2, %xmm2
291	vpmovmskb %xmm2, %eax
292	notw %ax
293	/ No ymm register was touched. /
294	ret
295
296	.p2align `4`,, `8`
297	L(between_9_15):
298	/ From 9 to 15 bytes. /
299	movq (%rdi), %rax
300	subq (%rsi), %rax
301	movq -`8`(%rdi, %rdx), %rcx
302	movq -`8`(%rsi, %rdx), %rdi
303	subq %rdi, %rcx
304	orq %rcx, %rax
305	/ edx is guaranteed to be a non-zero int. /
306	cmovnz %edx, %eax
307	ret
308
309	/ Don't align. This is cold and aligning here will cause code*
310	to spill into next cache line. /*
311	L(between_2_3):
312	/ From 2 to 3 bytes. No branch when size == 2. /
313	movzwl (%rdi), %eax
314	movzwl (%rsi), %ecx
315	subl %ecx, %eax
316	movzbl -`1`(%rdi, %rdx), %ecx
317	/ All machines that support evex will insert a "merging uop"*
318	avoiding any serious partial register stalls. /*
319	subb -`1`(%rsi, %rdx), %cl
320	orl %ecx, %eax
321	/ No ymm register was touched. /
322	ret
323
324	/ 2 Bytes from next cache line. /
325	END (MEMCMPEQ)
326	#endif
327

source code of glibc/sysdeps/x86_64/multiarch/memcmpeq-avx2.S