rawmemchr-evex.S source code [glibc/sysdeps/x86_64/multiarch/rawmemchr-evex.S]

1	/* rawmemchr optimized with 256-bit EVEX instructions.
2	   Copyright (C) 2022-2024 Free Software Foundation, Inc.
3	   This file is part of the GNU C Library.
4
5	   The GNU C Library is free software; you can redistribute it and/or
6	   modify it under the terms of the GNU Lesser General Public
7	   License as published by the Free Software Foundation; either
8	   version 2.1 of the License, or (at your option) any later version.
9
10	   The GNU C Library is distributed in the hope that it will be useful,
11	   but WITHOUT ANY WARRANTY; without even the implied warranty of
12	   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13	   Lesser General Public License for more details.
14
15	   You should have received a copy of the GNU Lesser General Public
16	   License along with the GNU C Library; if not, see
17	   <https://www.gnu.org/licenses/>.  */
18
19	#include <isa-level.h>
20	#include <sysdep.h>
21
22	#if ISA_SHOULD_BUILD (4)
23
24	# ifndef VEC_SIZE
	/* No vector configuration was selected before this point, so pull
	   in the 256-bit EVEX one.  Presumably this header is what
	   provides VEC_SIZE, VMM () and VMM_lo (); none of them is
	   defined in this file -- TODO confirm.  */
25	# include "x86-evex256-vecs.h"
26	# endif
27
28	# ifndef RAWMEMCHR
	/* Default symbol name; RAWMEMCHR may be predefined to emit the
	   same code under a different name.  */
29	# define RAWMEMCHR __rawmemchr_evex
30	# endif
31
32
	/* The search is byte granular (CHAR_SIZE is 1), so every
	   parameterized mnemonic is bound to its byte-element form.
	   PC_SHIFT_GPR and REG_WIDTH are not referenced by the code in
	   this file; presumably they configure "reg-macros.h", which is
	   included right after and is expected to supply VRAX, VRCX,
	   VRDI, KMOV and KORTEST -- TODO confirm against that header.  */
33	# define PC_SHIFT_GPR rdi
34	# define REG_WIDTH VEC_SIZE
35	# define VPTESTN vptestnmb
36	# define VPBROADCAST vpbroadcastb
37	# define VPMINU vpminub
38	# define VPCMP vpcmpb
39	# define VPCMPEQ vpcmpeqb
40	# define CHAR_SIZE 1
41
42	# include "reg-macros.h"
43
44	/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64
45	   doesn't have VEX encoding), use VEX encoding in loop so we
46	   can use vpcmpeqb + vptern which is more efficient than the
47	   EVEX alternative.  */
	/* NOTE(review): the `\|\|' in the condition below looks like an
	   escaping artifact of the logical-or operator `||' -- confirm
	   against the pristine source before building.  */
48	# if defined USE_IN_RTM \|\| VEC_SIZE == 64
	/* EVEX-only loop selected: drop whatever vzeroupper hooks were
	   defined earlier and make them expand to nothing (and the
	   return hook to a bare ret).  */
49	# undef COND_VZEROUPPER
50	# undef VZEROUPPER_RETURN
51	# undef VZEROUPPER
52
53
54	# define COND_VZEROUPPER
55	# define VZEROUPPER_RETURN ret
56	# define VZEROUPPER
57
58	# define USE_TERN_IN_LOOP 0
59	# else
	/* VEX-encoded vpcmpeqb + vpternlogd loop selected; VZEROUPPER
	   becomes a real vzeroupper.  COND_VZEROUPPER is not redefined
	   on this path, so its expansion comes from outside this file.  */
60	# define USE_TERN_IN_LOOP 1
61	# undef VZEROUPPER
62	# define VZEROUPPER vzeroupper
63	# endif
64
	/* CHAR_SIZE is 1, so one vector holds VEC_SIZE characters.  */
65	# define CHAR_PER_VEC VEC_SIZE
66
	/* Names and displacements of the two return stubs that finish a
	   loop exit on the 3rd or 4th vector of an iteration.  The exit
	   sequence falls through into the code that applies
	   FALLTHROUGH_RETURN_OFFSET, and L(TAIL_RETURN_LBL) is emitted
	   right after it.  With 64 chars per vector the last two vectors
	   are tested one at a time, so the fall-through serves the 4th
	   vector (x3).  Otherwise both masks are merged into one 64-bit
	   GPR based at the 3rd vector, so the fall-through uses the x2
	   displacement.  */
67	# if CHAR_PER_VEC == 64
68
69	# define TAIL_RETURN_LBL first_vec_x2
70	# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2)
71
72	# define FALLTHROUGH_RETURN_LBL first_vec_x3
73	# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
74
75	# else /* !(CHAR_PER_VEC == 64) */
76
77	# define TAIL_RETURN_LBL first_vec_x3
78	# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3)
79
80	# define FALLTHROUGH_RETURN_LBL first_vec_x2
81	# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
82	# endif /* !(CHAR_PER_VEC == 64) */
83
84
	/* VMATCH receives the search byte broadcast to every lane at
	   function entry.  VMATCH_LO is the VMM_lo () register with the
	   same index; the VEX-encoded loop copies VMATCH into it.  */
85	# define VMATCH VMM(0)
86	# define VMATCH_LO VMM_lo(0)
87
	/* Page size used by the entry check that guards the first
	   unaligned vector load.  */
88	# define PAGE_SIZE 4096
89
90	.section SECTION(.text), "ax", @progbits
	/* RAWMEMCHR: return the address of the first byte equal to the
	   search byte, scanning upward from the start pointer.  No length
	   is taken and no bound is checked anywhere below, so the only
	   way out of the main loop is a match.
	   In:   %rdi = start address.
	         %esi = search byte (VPBROADCAST is the byte broadcast,
	         so only the low byte is used).
	   Out:  %rax = address of the first matching byte.
	   Modifies: %rax, %rdi, flags, %k0-%k3, VMM (0), VMM (2),
	         VMM (3); depending on configuration also %rcx and
	         VMM_lo (0), VMM_lo (2)-VMM_lo (4).  */
91	ENTRY_P2ALIGN (RAWMEMCHR, `6`)
92	VPBROADCAST %esi, %VMATCH
93	/* Check if we may cross page boundary with one vector load.  */
94	movl %edi, %eax
95	andl $(PAGE_SIZE - `1`), %eax
96	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
97	ja L(page_cross)
98
	/* Unaligned compare of the first VEC_SIZE bytes.  Each bit set
	   in the resulting mask marks a matching byte.  */
99	VPCMPEQ (%rdi), %VMATCH, %k0
100	KMOV %k0, %VRAX
101
102	test %VRAX, %VRAX
103	jz L(aligned_more)
104	L(first_vec_x0):
	/* Index of the lowest set mask bit is the offset of the first
	   match from rdi.  */
105	bsf %VRAX, %VRAX
106	addq %rdi, %rax
107	ret
108
109	.p2align `4`,, `4`
110	L(first_vec_x4):
	/* Match in the vector at rdi + VEC_SIZE * 4.  */
111	bsf %VRAX, %VRAX
112	leaq (VEC_SIZE * `4`)(%rdi, %rax), %rax
113	ret
114
115	/* For VEC_SIZE == 32 we can fit this in aligning bytes so might
116	   as well place it more locally.  For VEC_SIZE == 64 we reuse
117	   return code at the end of loop's return.  */
118	# if VEC_SIZE == 32
119	.p2align `4`,, `4`
120	L(FALLTHROUGH_RETURN_LBL):
121	bsf %VRAX, %VRAX
122	leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
123	ret
124	# endif
125
126	.p2align `4`,, `6`
127	L(page_cross):
128	/* eax has lower page-offset bits of rdi so xor will zero them
129	   out.  */
130	xorq %rdi, %rax
	/* rax is now the page base.  Compare the last VEC_SIZE bytes of
	   this page, a load that ends exactly at the page boundary.  */
131	VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
132	KMOV %k0, %VRAX
133
134	/* Shift out out-of-bounds matches.  */
	/* I.e. drop the mask bits of bytes that precede rdi, so that bit
	   0 corresponds to rdi itself.  Presumably VRDI/VRAX are
	   VEC_SIZE-bit wide views, which makes shrx use rdi modulo
	   VEC_SIZE as the count -- TODO confirm in "reg-macros.h".  */
135	shrx %VRDI, %VRAX, %VRAX
136	test %VRAX, %VRAX
137	jnz L(first_vec_x0)
	/* No match before the page end: fall through.  */
138
139	.p2align `4`,, `10`
140	L(aligned_more):
141	L(page_cross_continue):
142	/* Align pointer.  */
143	andq $(VEC_SIZE * -`1`), %rdi
144
	/* Check the next four aligned vectors one by one before entering
	   the unrolled loop.  */
145	VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
146	KMOV %k0, %VRAX
147	test %VRAX, %VRAX
148	jnz L(first_vec_x1)
149
150	VPCMPEQ (VEC_SIZE * `2`)(%rdi), %VMATCH, %k0
151	KMOV %k0, %VRAX
152	test %VRAX, %VRAX
153	jnz L(first_vec_x2)
154
155	VPCMPEQ (VEC_SIZE * `3`)(%rdi), %VMATCH, %k0
156	KMOV %k0, %VRAX
157	test %VRAX, %VRAX
158	jnz L(first_vec_x3)
159
160	VPCMPEQ (VEC_SIZE * `4`)(%rdi), %VMATCH, %k0
161	KMOV %k0, %VRAX
162	test %VRAX, %VRAX
163	jnz L(first_vec_x4)
164
	/* Advance by one vector, then round rdi down to a multiple of
	   VEC_SIZE * 4.  The loop reads from rdi + VEC_SIZE * 4 onwards,
	   which is never past the bytes checked above, so it may
	   re-examine some of them.  */
165	subq $-(VEC_SIZE * `1`), %rdi
166	# if VEC_SIZE == 64
167	/* Saves code size.  No evex512 processor has partial register
168	   stalls.  If that changes this can be replaced with `andq
169	   $-(VEC_SIZE * 4), %rdi`.  */
170	xorb %dil, %dil
171	# else
172	andq $-(VEC_SIZE * `4`), %rdi
173	# endif
174
175	# if USE_TERN_IN_LOOP
176	/* Copy VMATCH to low ymm so we can use vpcmpeq which is not
177	   encodable with EVEX registers.  NB: this is VEC_SIZE == 32
178	   only as there is no way to encode vpcmpeq with zmm0-15.  */
179	vmovdqa64 %VMATCH, %VMATCH_LO
180	# endif
181
182	.p2align `4`
183	L(loop_4x_vec):
184	/* Two versions of the loop.  One that does not require
185	   vzeroupper by not using ymm0-15 and another that does
186	   require vzeroupper because it uses ymm0-15.  The reason why
187	   ymm0-15 is used at all is because there is no EVEX encoding
188	   of vpcmpeq that writes a vector register, and with vpcmpeq
189	   this loop can be performed more efficiently.  The
190	   non-vzeroupper version is safe for RTM while the vzeroupper
191	   version should be preferred if RTM is not supported.  Which
192	   loop version we use is determined by USE_TERN_IN_LOOP.  */
193
194	# if USE_TERN_IN_LOOP
195	/* Since vptern can only take 3x vectors fastest to do 1 vec
196	   separately with EVEX vpcmp.  */
197	VPCMPEQ (VEC_SIZE * `4`)(%rdi), %VMATCH, %k1
198	/* Compare 3x with vpcmpeq and or them all together with vptern.
199	 */
200
	/* rdi is advanced in the middle of the sequence, so the
	   displacements 2 and 3 after the subq address what were
	   vectors 6 and 7 before it.  */
201	VPCMPEQ (VEC_SIZE * `5`)(%rdi), %VMATCH_LO, %VMM_lo(`2`)
202	subq $(VEC_SIZE * -`4`), %rdi
203	VPCMPEQ (VEC_SIZE * `2`)(%rdi), %VMATCH_LO, %VMM_lo(`3`)
204	VPCMPEQ (VEC_SIZE * `3`)(%rdi), %VMATCH_LO, %VMM_lo(`4`)
205
206	/* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
207	   VEC_lo(4).  */
208	vpternlogd $`254`, %VMM_lo(`2`), %VMM_lo(`3`), %VMM_lo(`4`)
209	vpmovmskb %VMM_lo(`4`), %VRCX
210
211	KMOV %k1, %eax
212
213	/* NB: rax has match from first VEC and rcx has matches from
214	   VEC 2-4.  If rax is non-zero we will return that match.  If
215	   rax is zero adding won't disturb the bits in rcx.  */
216	add %rax, %rcx
217	# else
218	/* Loop version that uses EVEX encoding.  */
	/* k1 = not-equal mask of the 1st vector (VPCMP predicate 4).
	   The xors leave a zero byte wherever the 2nd/3rd vector equals
	   the search byte.  VPMINU merges them and the {%k1}{z}
	   zero-masking also zeroes the lanes where the 1st vector
	   matched, so VPTESTN yields k2 = match in any of vectors 1-3.
	   k3 = matches in the 4th vector.  */
219	VPCMP $`4`, (VEC_SIZE * `4`)(%rdi), %VMATCH, %k1
220	vpxorq (VEC_SIZE * `5`)(%rdi), %VMATCH, %VMM(`2`)
221	vpxorq (VEC_SIZE * `6`)(%rdi), %VMATCH, %VMM(`3`)
222	VPCMPEQ (VEC_SIZE * `7`)(%rdi), %VMATCH, %k3
223	VPMINU %VMM(`2`), %VMM(`3`), %VMM(`3`){%k1}{z}
224	VPTESTN %VMM(`3`), %VMM(`3`), %k2
225	subq $(VEC_SIZE * -`4`), %rdi
	/* ZF is set only if k2 | k3 is zero, i.e. no match in any of the
	   four vectors.  */
226	KORTEST %k2, %k3
227	# endif
228	jz L(loop_4x_vec)
229
	/* A match exists in the four vectors now located at rdi + 0,
	   VEC_SIZE, VEC_SIZE * 2 and VEC_SIZE * 3.  Find the first.  */
230	# if USE_TERN_IN_LOOP
	/* rax still holds the 1st vector's match mask from the loop.  */
231	test %VRAX, %VRAX
232	# else
	/* k1 is a not-equal mask: all ones means no match, and the inc
	   wraps to zero (assuming VRAX is exactly as wide as the mask --
	   TODO confirm).  Otherwise the carry stops at the lowest clear
	   bit, so the lowest set bit of the result is the first match.  */
233	KMOV %k1, %VRAX
234	inc %VRAX
235	# endif
236	jnz L(last_vec_x0)
237
238
	/* 2nd vector.  */
239	# if USE_TERN_IN_LOOP
240	vpmovmskb %VMM_lo(`2`), %VRAX
241	# else
242	VPTESTN %VMM(`2`), %VMM(`2`), %k1
243	KMOV %k1, %VRAX
244	# endif
245	test %VRAX, %VRAX
246	jnz L(last_vec_x1)
247
248
	/* 3rd vector.  In the EVEX version k2 also covers vectors 1 and
	   2, but those were just found to hold no match.  */
249	# if USE_TERN_IN_LOOP
250	vpmovmskb %VMM_lo(`3`), %VRAX
251	# else
252	KMOV %k2, %VRAX
253	# endif
254
255	/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
256	   (only if used VEX encoded loop).  */
257	COND_VZEROUPPER
258
259	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
260	   returning last 2x VEC.  For VEC_SIZE == 64 we test each VEC
261	   individually, for VEC_SIZE == 32 we combine them in a single
262	   64-bit GPR.  */
263	# if CHAR_PER_VEC == 64
264	# if USE_TERN_IN_LOOP
265	# error "Unsupported"
266	# endif
267
268
269	/* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
270	test %VRAX, %VRAX
271	jnz L(first_vec_x2)
272	KMOV %k3, %VRAX
273	L(FALLTHROUGH_RETURN_LBL):
274	# else
275	/* CHAR_PER_VEC <= 32 so we can combine the results from the
276	   last 2x VEC.  */
	/* In the VEX version rcx already holds the or of vectors 2-4
	   from the loop.  Vector 2 has no match, and any vector 3 bits
	   repeated in the upper half sit above the same bits in rax,
	   so bsf still returns the first match.  */
277	# if !USE_TERN_IN_LOOP
278	KMOV %k3, %VRCX
279	# endif
280	salq $CHAR_PER_VEC, %rcx
281	addq %rcx, %rax
282	# endif
283	bsf %rax, %rax
284	leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
285	ret
286
287	.p2align `4`,, `8`
288	L(TAIL_RETURN_LBL):
289	bsf %rax, %rax
290	leaq (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
291	ret
292
293	.p2align `4`,, `8`
294	L(last_vec_x1):
	/* Loop exit: apply COND_VZEROUPPER first, then share the return
	   code with the pre-loop path.  */
295	COND_VZEROUPPER
296	L(first_vec_x1):
297	bsf %VRAX, %VRAX
298	leaq (VEC_SIZE * `1`)(%rdi, %rax), %rax
299	ret
300
301	.p2align `4`,, `8`
302	L(last_vec_x0):
	/* Loop exit with the match in the vector at rdi itself.  */
303	COND_VZEROUPPER
304	bsf %VRAX, %VRAX
305	addq %rdi, %rax
306	ret
307	END (RAWMEMCHR)
308	#endif
309

source code of glibc/sysdeps/x86_64/multiarch/rawmemchr-evex.S