/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE and
      loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE bytes, one
      VEC at a time, to check for early mismatches.  Only do this if
      it is guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
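
/* NB: __memcmpeq only has to distinguish "equal" from "not equal";
   unlike memcmp, the sign/ordering of the result is unspecified.  A
   rough C model of the contract (illustrative only, not this
   implementation):

       int __memcmpeq (const void *s1, const void *s2, size_t n)
       {
         const unsigned char *a = s1, *b = s2;
         unsigned int acc = 0;
         while (n--)
           acc |= *a++ ^ *b++;   // any non-zero value may be returned
         return acc;
       }

   This is what allows mismatches below to be reduced with xor/or
   accumulation, `popcntq`, or `mask + 1` instead of locating the
   first differing byte.  */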

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ __memcmpeq_evex
# endif

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif
# include "reg-macros.h"


# if VEC_SIZE == 32

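        /* For VEC_SIZE == 32, VEC_CMP below is VPCMPEQ, so a full
           match yields an all-ones 32-bit mask; `inc` then wraps it
           to zero (setting ZF) exactly when every byte matched, and
           on a mismatch the incremented mask doubles as a convenient
           non-zero return value.  VPCMPNEQ masks are simply tested
           against zero.  */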
#  define TEST_ZERO_VCMP(reg) inc %VGPR(reg)
#  define TEST_ZERO(reg) test %VGPR(reg), %VGPR(reg)

#  define TO_32BIT_P1(reg) /* Do nothing. */
#  define TO_32BIT_P2(reg) /* Do nothing. */
#  define TO_32BIT(reg) /* Do nothing. */

#  define VEC_CMP VPCMPEQ

# elif VEC_SIZE == 64

#  define TEST_ZERO_VCMP(reg) TEST_ZERO(reg)
#  define TEST_ZERO(reg) neg %VGPR(reg)


        /* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
           int.  We have two methods for this.  If the mask was
           branched on, we use `neg` for the branch and then `sbb` to
           get the 32-bit return.  If the mask was not branched on, we
           just use `popcntq`.  */
#  define TO_32BIT_P1(reg) TEST_ZERO(reg)
#  define TO_32BIT_P2(reg) sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
#  define TO_32BIT(reg) popcntq %reg, %reg
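        /* NB: `neg` sets CF exactly when the mask is non-zero, so the
           following `sbb reg32, reg32` materializes 0 (all bytes
           equal) or -1 (mismatch) as the int return value.  `popcntq`
           is likewise zero only for an all-zero mask, and its result
           (at most 64) fits in the 32-bit return.  */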

#  define VEC_CMP VPCMPNEQ

# else
#  error "Unsupported VEC_SIZE"
# endif


# define VMOVU_MASK vmovdqu8
# define VPCMPNEQ vpcmpneqb
# define VPCMPEQ vpcmpeqb
# define VPTEST vptestmb

# define PAGE_SIZE 4096

        .section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl    %edx, %edx
# endif
        cmp     $VEC_SIZE, %RDX_LP
        /* Fall through for [0, VEC_SIZE] as it is the hottest case.  */
        ja      L(more_1x_vec)

        /* Create mask of bytes that are guaranteed to be valid
           because of length (edx).  Using masked movs allows us to
           skip checks for page crosses/zero size.  */
        mov     $-1, %VRAX
        bzhi    %VRDX, %VRAX, %VRAX
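        /* For example, with edx == 5 this leaves VRAX == 0x1f, so %k2
           below enables only the low 5 bytes of the masked load and
           compare; with edx == VEC_SIZE, bzhi keeps all bits set.  */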
        /* NB: A `jz` might be useful here.  Page-faults that are
           suppressed by predicate execution (the evex mask) can be
           very slow.  The expectation is this is not the norm and
           "most" code will not regularly call 'memcmp' with length
           = 0 and memory that is not wired up.  */
        KMOV    %VRAX, %k2

        /* Use masked loads as a full VEC_SIZE load could page cross
           where the length (edx) would not.  */
        VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
        VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
        KMOV    %k1, %VRAX
        TO_32BIT (VRAX)
        ret

        .p2align 4,, 3
L(last_1x_vec):
        VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
        VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
        KMOV    %k1, %VRAX
        TO_32BIT_P1 (rax)
L(return_neq0):
        TO_32BIT_P2 (rax)
        ret


        .p2align 4,, 12
L(more_1x_vec):
        /* From VEC + 1 to 2 * VEC.  */
        VMOVU   (%rsi), %VMM(1)
        /* Use compare not equals to directly check for mismatch.  */
        VPCMPNEQ (%rdi), %VMM(1), %k1
        KMOV    %k1, %VRAX
        TEST_ZERO (rax)
        jnz     L(return_neq0)

        cmpq    $(VEC_SIZE * 2), %rdx
        jbe     L(last_1x_vec)

        /* Check second VEC no matter what.  */
        VMOVU   VEC_SIZE(%rsi), %VMM(2)
        VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1
        KMOV    %k1, %VRAX
        TEST_ZERO (rax)
        jnz     L(return_neq0)

        /* Less than 4 * VEC.  */
        cmpq    $(VEC_SIZE * 4), %rdx
        jbe     L(last_2x_vec)

        /* Check third and fourth VEC no matter what.  */
        VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(3)
        VEC_CMP (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
        KMOV    %k1, %VRAX
        TEST_ZERO_VCMP (rax)
        jnz     L(return_neq0)

        VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(4)
        VEC_CMP (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
        KMOV    %k1, %VRAX
        TEST_ZERO_VCMP (rax)
        jnz     L(return_neq0)

        /* Go to 4x VEC loop.  */
        cmpq    $(VEC_SIZE * 8), %rdx
        ja      L(more_8x_vec)

        /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without
           any branches.  */

        VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
        VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
        addq    %rdx, %rdi

        /* Wait to load from s1 until after the address adjustment,
           to avoid unlamination of the loads.  */

        /* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
           will have some 1s.  */
        vpxorq  -(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
        /* Ternary logic to xor -(VEC_SIZE * 2)(%rdi) with VEC(2)
           while oring with VEC(1).  Result is stored in VEC(2).  */
        vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)
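        /* NB: imm8 0xde computes B | (A ^ C), with A = VEC(2) (the
           destination), B = VEC(1), and C = the memory operand, i.e.
           the s1/s2 xor for this VEC or'd with the previous xor
           result.  */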

        cmpl    $(VEC_SIZE * 6), %edx
        jbe     L(4x_last_2x_vec)

        VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
        vpxorq  -(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)

        VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
        vpxorq  -(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)

        /* Or together VEC(4), VEC(3), and VEC(2) into VEC(2).  */
        vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
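        /* NB: imm8 0xfe is the three-way OR A | B | C.  */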

        /* Compare VEC(2) with 0.  If any 1s, s1 and s2 don't match.  */
L(4x_last_2x_vec):
        VPTEST  %VMM(2), %VMM(2), %k1
        KMOV    %k1, %VRAX
        TO_32BIT (VRAX)
        ret


        .p2align 4,, 10
L(more_8x_vec):
        /* Set rdx to the end of s1 minus 4 * VEC_SIZE (the loop
           bound).  */
        leaq    -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
        /* rsi stores s2 - s1.  This allows the loop to only update
           one pointer.  */
        subq    %rdi, %rsi
        /* Align the s1 pointer.  */
        andq    $-VEC_SIZE, %rdi
        /* Adjust because the first 4x VEC were already checked.  */
        subq    $-(VEC_SIZE * 4), %rdi
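        /* NB: subtracting -(VEC_SIZE * 4) rather than adding keeps
           the immediate encodable as a sign-extended imm8 when
           VEC_SIZE == 32 (-128 fits, +128 does not).  */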
        .p2align 5,, 12
        .p2align 4,, 8
L(loop_4x_vec):
        VMOVU   (%rsi, %rdi), %VMM(1)
        vpxorq  (%rdi), %VMM(1), %VMM(1)

        VMOVU   VEC_SIZE(%rsi, %rdi), %VMM(2)
        vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)

        VMOVU   (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
        vpxorq  (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)

        VMOVU   (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
        vpxorq  (VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)

        vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
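        /* Or together the xor results in VEC(2), VEC(3), and VEC(4)
           into VEC(4); any set bit means a mismatch somewhere in this
           4x VEC block.  */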
        VPTEST  %VMM(4), %VMM(4), %k1
        KMOV    %k1, %VRAX
        TEST_ZERO (rax)
        jnz     L(return_neq2)
        subq    $-(VEC_SIZE * 4), %rdi
        cmpq    %rdx, %rdi
        jb      L(loop_4x_vec)
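        /* At most 4 * VEC_SIZE bytes remain before the end of s1.
           Handle the final 4 VECs relative to rdx (the end of s1
           minus 4 * VEC_SIZE); these loads may overlap data that was
           already checked.  */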

        subq    %rdx, %rdi

        VMOVU   (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
        vpxorq  (VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)
        /* rdi has 4 * VEC_SIZE - remaining length.  */

        /* Load regardless of branch.  */
        VMOVU   (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
        /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3)
           while oring with VEC(4).  Result is stored in VEC(4).  */
        vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)

        /* Separate logic as we can only use testb for VEC_SIZE == 64.  */
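        /* For VEC_SIZE == 64 the difference in edi is in [0, 256), so
           the sign bit of %dil is set exactly when at most
           2 * VEC_SIZE bytes remain.  */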
# if VEC_SIZE == 64
        testb   %dil, %dil
        js      L(8x_last_2x_vec)
# else
        cmpl    $(VEC_SIZE * 2), %edi
        jge     L(8x_last_2x_vec)
# endif
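        /* If the branch was taken, at most 2 * VEC_SIZE bytes remain
           and the last two VECs, already folded into VEC(4), cover
           them; otherwise the third- and fourth-to-last VECs are
           folded in below as well.  */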

        VMOVU   VEC_SIZE(%rsi, %rdx), %VMM(2)
        vpxorq  VEC_SIZE(%rdx), %VMM(2), %VMM(2)

        VMOVU   (%rsi, %rdx), %VMM(1)
        vpxorq  (%rdx), %VMM(1), %VMM(1)

        vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)
L(8x_last_1x_vec):
L(8x_last_2x_vec):
        VPTEST  %VMM(4), %VMM(4), %k1
        KMOV    %k1, %VRAX
        TO_32BIT_P1 (rax)
L(return_neq2):
        TO_32BIT_P2 (rax)
        ret

        .p2align 4,, 4
L(last_2x_vec):
        VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
        vpxorq  -(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
        VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
        vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
        VPTEST  %VMM(2), %VMM(2), %k1
        KMOV    %k1, %VRAX
        TO_32BIT (VRAX)
        ret

        /* evex256: 1 byte from next cache line.  evex512: 15 bytes
           from next cache line.  */
END (MEMCMPEQ)
#endif


source code of glibc/sysdeps/x86_64/multiarch/memcmpeq-evex.S