/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC chars
      one vector at a time to check for early mismatches.  Only do
      this if it's guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.

When possible the implementation tries to optimize for the frontend in
the following ways:
Throughput:
    1. All code sections that fit are able to run optimally out of the
       LSD.
    2. All code sections that fit are able to run optimally out of the
       DSB.
    3. Basic blocks are contained in the minimum number of fetch
       blocks necessary.

Latency:
    1. Logically connected basic blocks are put in the same
       cache-line.
    2. Logically connected basic blocks that do not fit in the same
       cache-line are put in adjacent lines.  This can benefit from L2
       spatial prefetching and L1 next-line prefetching.  */
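
/* As a rough, illustrative sketch, the size dispatch implemented below
   corresponds to:

       if (len <= CHAR_PER_VEC)           masked single-VEC compare
       else if (len <= 2 * CHAR_PER_VEC)  VEC 0, then last VEC
       else if (len <= 4 * CHAR_PER_VEC)  VEC 0-1, then last 2 VECs
       else if (len <= 8 * CHAR_PER_VEC)  VEC 0-3, then last 4 VECs (branchless)
       else                               aligned 4-VEC loop, then tail

   where "last N VECs" are addressed relative to the end of the buffers
   and may overlap the already-checked leading VECs.  */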

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_evex_movbe
# endif

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifdef USE_AS_WMEMCMP
#  define VMOVU_MASK vmovdqu32
#  define CHAR_SIZE 4
#  define VPCMP vpcmpd
#  define VPCMPEQ vpcmpeqd
#  define VPTEST vptestmd

#  define USE_WIDE_CHAR
# else
#  define VMOVU_MASK vmovdqu8
#  define CHAR_SIZE 1
#  define VPCMP vpcmpub
#  define VPCMPEQ vpcmpeqb
#  define VPTEST vptestmb
# endif

# include "reg-macros.h"

# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)


/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/
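
/* For example, comparing the byte 0x80 with the byte 0x01: memcmp
   treats them as unsigned chars (0x80 = 128 > 1, positive result),
   while wmemcmp comparing the wide chars 0x80000000 and 0x00000001
   treats them as signed ints (negative < positive, negative result).
   This is why the wide-char paths below use a signed cmpl/setg
   sequence and the byte paths use movzbl/subl.  */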

	.section SECTION(.text), "ax", @progbits
/* Cache align memcmp entry.  This allows for much more thorough
   frontend optimization.  */
ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
	cmp $CHAR_PER_VEC, %RDX_LP
	/* Fall through for [0, CHAR_PER_VEC] as it's the hottest.  */
	ja L(more_1x_vec)

	/* Create mask of bytes that are guaranteed to be valid because
	   of length (edx).  Using masked movs allows us to skip checks
	   for page crosses/zero size.  */
	mov $-1, %VRAX
	bzhi %VRDX, %VRAX, %VRAX
	/* NB: A `jz` might be useful here.  Page-faults that are
	   invalidated by predicated execution (the evex mask) can be
	   very slow.  The expectation is that this is not the norm, as
	   "most" code will not regularly call 'memcmp' with length = 0
	   and memory that is not wired up.  */
	KMOV %VRAX, %k2
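	/* Illustration: for length = 5 (chars), bzhi leaves only the low
	   5 bits of VRAX set, so k2 selects exactly the first 5 chars
	   and every later lane is ignored by both the load and the
	   compare below.  */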



	/* Safe to load full ymm with mask.  */
	VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
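	/* The {z} zero-masking clears the lanes not selected by k2
	   instead of merging with VMM(2)'s old contents, so the load
	   carries no dependency on the register's previous value.  The
	   compare below is masked by k2 as well, so the zeroed lanes
	   never report a spurious mismatch.  */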
	/* Slightly different method for VEC_SIZE == 64 to save a bit of
	   code size.  This allows us to fit L(return_vec_0) entirely in
	   the first cache line.  */
# if VEC_SIZE == 64
	VPCMPEQ (%rdi), %VMM(2), %k1{%k2}
	KMOV %k1, %VRCX
	sub %VRCX, %VRAX
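	/* Since the compare is masked by k2, the equal-lane mask in VRCX
	   can only have bits set where the valid-lane mask in VRAX does.
	   The subtraction therefore never borrows and is equivalent to
	   VRAX & ~VRCX, i.e. the mask of mismatched valid chars that
	   L(return_vec_0) expects in VRAX.  */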
# else
	VPCMP $4, (%rdi), %VMM(2), %k1{%k2}
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
# endif
	jnz L(return_vec_0)
	ret

	.p2align 4,, 11
L(return_vec_0):
	bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	movl (%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl (%rsi, %rax, CHAR_SIZE), %ecx
	/* NB: no partial register stall here because of the xorl
	   zero-idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
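	/* This is the -1/1 return construction used throughout this
	   file: edx was zeroed, setg sets dl to 1 iff the s1 char
	   compares greater (signed), and leal computes 2 * rdx - 1,
	   i.e. 1 on s1 > s2 and -1 on s1 < s2.  Equality is impossible
	   here since this path is only reached on a mismatch.  */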
# else
	movzbl (%rsi, %rax), %ecx
#  if VEC_SIZE == 64
	movb (%rdi, %rax), %al
#  else
	movzbl (%rdi, %rax), %eax
#  endif
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 11
L(more_1x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU (%rsi), %VMM(1)
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP $4, (%rdi), %VMM(1), %k1
	KMOV %k1, %VRAX
	/* NB: eax must be destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) destination
	   register must be ecx.  */
	test %VRAX, %VRAX
	jnz L(return_vec_0)

	cmpq $(CHAR_PER_VEC * 2), %rdx
	jbe L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU VEC_SIZE(%rsi), %VMM(2)
	VPCMP $4, VEC_SIZE(%rdi), %VMM(2), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq $(CHAR_PER_VEC * 4), %rdx
	jbe L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
	VPCMP $4, (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_2)

	VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
	VPCMP $4, (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
	KMOV %k1, %VRCX
	test %VRCX, %VRCX
	jnz L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq $(CHAR_PER_VEC * 8), %rdx
	ja L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
	VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(2)
	leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
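	/* rdi and rsi now point 4 * VEC_SIZE bytes before the end of s1
	   and s2, so the four vector loads at offsets 0..3 * VEC_SIZE
	   cover exactly the last 4 VECs of the buffers.  They may
	   overlap the first 4 VECs checked above, which is harmless
	   since those already compared equal.  */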

	/* Wait to load from s1 until the addresses have been adjusted,
	   due to unlamination of micro-fusion with complex address
	   modes.  */

	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq (%rdi), %VMM(1), %VMM(1)
	vpxorq (VEC_SIZE)(%rdi), %VMM(2), %VMM(2)

	VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
	vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)

	VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with VEC(4) while
	   oring with VEC(1).  Result is stored in VEC(4).  */
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)

	/* Or together VEC(2), VEC(3), and VEC(4) into VEC(4).  */
	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
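	/* The vpternlogd imm8 is a 3-input truth table indexed, in Intel
	   operand order, by the destination bit (high), the middle
	   source bit, and the last (memory) source bit (low).  0xde
	   evaluates (dst ^ last_src) | middle_src, i.e. the fourth VEC's
	   XOR-difference or'd with VEC(1)'s difference, and 0xfe is the
	   plain 3-way OR, so VEC(4) accumulates the differences of all
	   four VECs.  */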

	/* Test VEC(4) against itself.  Store any CHAR mismatches in k1.
	 */
	VPTEST %VMM(4), %VMM(4), %k1
	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
	KMOV %k1, %VRCX
	test %VRCX, %VRCX
	jnz L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret


	.p2align 4,, 9
L(8x_end_return_vec_0_1_2_3):
	movq %rdx, %rdi
L(8x_return_vec_0_1_2_3):
	/* L(loop_4x_vec) leaves result in `k1` for VEC_SIZE == 64.  */
# if VEC_SIZE == 64
	KMOV %k1, %VRCX
# endif
	addq %rdi, %rsi
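	/* On the 8x path rsi still holds s2 - s1 (set up in
	   L(more_8x_vec)), so adding the s1 base used for the
	   mismatching loads (rdi, or rdx for the end-of-buffer case)
	   turns it back into a usable s2 pointer for the return code
	   below.  */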
L(return_vec_0_1_2_3):
	VPTEST %VMM(1), %VMM(1), %k0
	KMOV %k0, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_0)

	VPTEST %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_1)

	VPTEST %VMM(3), %VMM(3), %k0
	KMOV %k0, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_2)
	.p2align 4,, 2
L(return_vec_3):
	/* bsf saves 1 byte from tzcnt.  This keeps L(return_vec_3) in
	   one fetch block and the entire L(*return_vec_0_1_2_3) in 1
	   cache line.  */
	bsf %VRCX, %VRCX
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
	xorl %edx, %edx
	cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl %ecx, %eax
# endif
	ret


	.p2align 4,, 8
L(return_vec_1):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
	   fetch block.  */
	bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl VEC_SIZE(%rsi, %rax), %ecx
	movzbl VEC_SIZE(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 7
L(return_vec_2):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
	   fetch block.  */
	bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 8
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */
	subq %rdi, %rsi
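	/* With rsi = s2 - s1, the loop can reach the s2 data matching
	   any s1 address held in rdi via the (%rsi, %rdi) addressing
	   mode, so aligning and advancing rdi automatically keeps the
	   two streams in sync.  */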
	/* Align s1 pointer.  */
	andq $-VEC_SIZE, %rdi
	/* Adjust because the first 4x vec were already checked.  */
	subq $-(VEC_SIZE * 4), %rdi
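	/* NB: subtracting the negative constant instead of using addq is
	   a code-size trick: for VEC_SIZE == 32, -128 fits in a
	   sign-extended 8-bit immediate while +128 does not.  */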

	.p2align 4
L(loop_4x_vec):
	VMOVU (%rsi, %rdi), %VMM(1)
	vpxorq (%rdi), %VMM(1), %VMM(1)
	VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2)
	vpxorq VEC_SIZE(%rdi), %VMM(2), %VMM(2)
	VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
	vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
	VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)
	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
	VPTEST %VMM(4), %VMM(4), %k1
	/* If VEC_SIZE == 64 just branch with KTEST.  We have free port0
	   space and it allows the loop to fit in 2x cache lines
	   instead of 3.  */
# if VEC_SIZE == 64
	KTEST %k1, %k1
# else
	KMOV %k1, %VRCX
	test %VRCX, %VRCX
# endif
	jnz L(8x_return_vec_0_1_2_3)
	subq $-(VEC_SIZE * 4), %rdi
	cmpq %rdx, %rdi
	jb L(loop_4x_vec)
	subq %rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
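	/* I.e. edi is in [0, 4 * VEC_SIZE): edi >= 3 * VEC_SIZE means at
	   most one VEC of unchecked data remains, edi >= 2 * VEC_SIZE
	   means at most two.  The tail below is addressed relative to
	   rdx (end of s1 minus 4 * VEC_SIZE), so it may re-compare data
	   the loop already covered, which is harmless.  */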
	cmpl $(VEC_SIZE * 3), %edi
	jge L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)

	/* Separate logic as we can only use testb for VEC_SIZE == 64.  */
# if VEC_SIZE == 64
	testb %dil, %dil
	js L(8x_last_2x_vec)
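	/* edi < 4 * VEC_SIZE == 256 here, so its low byte equals edi and
	   the byte's sign bit is set exactly when edi >= 128 ==
	   2 * VEC_SIZE.  testb/js is therefore equivalent to the
	   cmpl/jge in the other branch, just smaller.  */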
# else
	cmpl $(VEC_SIZE * 2), %edi
	jge L(8x_last_2x_vec)
# endif

	vpxorq (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(3)

	VMOVU (%rsi, %rdx), %VMM(1)
	vpxorq (%rdx), %VMM(1), %VMM(1)

	VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2)
	vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2)
	VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %VMM(1), %VMM(4)
	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
	VPTEST %VMM(4), %VMM(4), %k1
	/* L(8x_end_return_vec_0_1_2_3) expects the bitmask to still be
	   in `k1` if VEC_SIZE == 64.  */
# if VEC_SIZE == 64
	KTEST %k1, %k1
# else
	KMOV %k1, %VRCX
	test %VRCX, %VRCX
# endif
	jnz L(8x_end_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4,, 6
L(8x_last_2x_vec):
	VPCMP $4, (VEC_SIZE * 2)(%rdx), %VMM(3), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(8x_return_vec_2)
	.p2align 4,, 5
L(8x_last_1x_vec):
	VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
	VPCMP $4, (VEC_SIZE * 3)(%rdx), %VMM(1), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(8x_return_vec_3)
	ret

	/* Not ideally aligned (at offset +9 bytes in fetch block) but
	   not aligning keeps it in the same cache line as
	   L(8x_last_1x/2x_vec), so it is likely worth it.  It also
	   saves code size.  */
	.p2align 4,, 4
L(8x_return_vec_2):
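	/* The mismatch was found at offset VEC_SIZE * 2 from rdx, but
	   the return code shared with L(8x_return_vec_3) indexes at
	   VEC_SIZE * 3, so back rdx up by one VEC first.  */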
	subq $VEC_SIZE, %rdx
L(8x_return_vec_3):
	bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
	leaq (%rdx, %rax, CHAR_SIZE), %rax
	movl (VEC_SIZE * 3)(%rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	addq %rdx, %rax
	movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 3)(%rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 8
L(last_2x_vec):
	/* Check second to last VEC.  */
	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
	VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_1_end)

	/* Check last VEC.  */
	.p2align 4,, 8
L(last_1x_vec):
	VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
	VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
	KMOV %k1, %VRAX
	test %VRAX, %VRAX
	jnz L(return_vec_0_end)
	ret


	/* Don't fully align.  It takes 2 fetch blocks either way, and
	   aligning would cause code to spill into another cache line.
	 */
	.p2align 4,, 3
L(return_vec_1_end):
	/* Use bsf to save code size.  This is necessary to have
	   L(one_or_less) fit in the aligning bytes between.  */
	bsf %VRAX, %VRAX
	addl %edx, %eax
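	/* eax now holds size + mismatch index (both in chars), so the
	   -(VEC_SIZE * 2) displacements below resolve to the mismatching
	   char of the second-to-last VEC, which was loaded relative to
	   the end of the buffers.  */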
# ifdef USE_AS_WMEMCMP
	movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 2
	/* Don't align.  It takes 2 fetch blocks either way, and aligning
	   would cause code to spill into another cache line.  */
L(return_vec_0_end):
	bsf %VRAX, %VRAX
	addl %edx, %eax
# ifdef USE_AS_WMEMCMP
	movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl -VEC_SIZE(%rsi, %rax), %ecx
	movzbl -VEC_SIZE(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret
	/* evex256: 2 bytes until next cache line.  evex512: 46 bytes
	   until next cache line.  */
END (MEMCMP)
#endif