/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# include "reg-macros.h"

# ifndef MEMRCHR
#  define MEMRCHR __memrchr_evex
# endif

# define PAGE_SIZE 4096
# define VMATCH VMM(0)

        .section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
        /* Clear upper bits.  */
        and %RDX_LP, %RDX_LP
# else
        test %RDX_LP, %RDX_LP
# endif
        jz L(zero_0)

        /* Get end pointer.  Minus one for three reasons.  1) It is
           necessary for a correct page cross check, 2) it sets up the
           end pointer so that subtracting lzcnt of the match mask
           yields the match address, and 3) it is a necessary step in
           aligning the pointer.  */
        leaq -1(%rdi, %rdx), %rax
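        /* Concretely, rax now points at the last byte of the buffer
           (&buf[len - 1]), so a match found lzcnt bytes down from the
           top of the final vector can be returned as rax - lzcnt.  */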
        vpbroadcastb %esi, %VMATCH

        /* Check if we can load 1x VEC without crossing a page.  */
        testl $(PAGE_SIZE - VEC_SIZE), %eax
        jz L(page_cross)

        /* Don't use rax for the pointer here because EVEX has better
           encoding with offset % VEC_SIZE == 0.  */
        vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0
        KMOV %k0, %VRCX

        /* If rcx is zero then lzcnt -> VEC_SIZE.  NB: there is
           already a dependency between rcx and rsi so no worries
           about a false dependency here.  */
        lzcnt %VRCX, %VRSI
        /* If rdx <= rsi then either 1) rcx was non-zero (there was a
           match) but it was out of bounds or 2) rcx was zero and rdx
           was <= VEC_SIZE so we are done scanning.  */
        cmpq %rsi, %rdx
        /* NB: Use a branch to return zero/non-zero.  Common usage
           will branch on the result of the function (null/non-null
           return).  This branch can be used to predict the ensuing
           one so there is no reason to extend the data dependency
           with cmovcc.  */
        jbe L(zero_0)
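        /* Worked through: the VEC load above covers bytes
           [rdi + rdx - VEC_SIZE, rdi + rdx).  If the highest match is
           mask bit j then rsi == VEC_SIZE - 1 - j and the match lives
           at rax - rsi; that address is >= rdi exactly when
           rsi < rdx, which is the check above.  */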

        /* If rcx is zero then len must be > VEC_SIZE; otherwise,
           since we already tested len vs lzcnt(rcx) (in rsi), we are
           good to return this match.  */
        test %VRCX, %VRCX
        jz L(more_1x_vec)
        subq %rsi, %rax
        ret

        /* Fits in aligning bytes of first cache line for VEC_SIZE ==
           32.  */
# if VEC_SIZE == 32
        .p2align 4,, 2
L(zero_0):
        xorl %eax, %eax
        ret
# endif

        .p2align 4,, 10
L(more_1x_vec):
        /* Align rax (pointer to string).  */
        andq $-VEC_SIZE, %rax
L(page_cross_continue):
        /* Recompute length after aligning.  */
        subq %rdi, %rax

        cmpq $(VEC_SIZE * 2), %rax
        ja L(more_2x_vec)

L(last_2x_vec):
        vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX

        test %VRCX, %VRCX
        jnz L(ret_vec_x0_test)

        /* If VEC_SIZE == 64 need to subtract because lzcntq won't
           implicitly add VEC_SIZE to match position.  */
# if VEC_SIZE == 64
        subl $VEC_SIZE, %eax
# else
        cmpb $VEC_SIZE, %al
# endif
        jle L(zero_2)

        /* We adjusted rax (length) for VEC_SIZE == 64 so need separate
           offsets.  */
# if VEC_SIZE == 64
        vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
# else
        vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
# endif
        KMOV %k0, %VRCX
        /* NB: 64-bit lzcnt.  This will naturally add 32 to position
           for VEC_SIZE == 32.  */
        lzcntq %rcx, %rcx
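        /* The upper bits of rcx above the mask are zero, so if the
           highest match is mask bit j then lzcntq yields 63 - j and
           eax - (63 - j) is one past the match offset; that pairs
           with the -1 in L(first_vec_x1_ret).  */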
        subl %ecx, %eax
        ja L(first_vec_x1_ret)
        /* If VEC_SIZE == 64 put L(zero_0) here as we can't fit it in
           the first cache line (this is the second cache line).  */
# if VEC_SIZE == 64
L(zero_0):
# endif
L(zero_2):
        xorl %eax, %eax
        ret

        /* NB: Fits in aligning bytes before the next cache line for
           VEC_SIZE == 32.  For VEC_SIZE == 64 this is attached to
           L(ret_vec_x0_test).  */
# if VEC_SIZE == 32
L(first_vec_x1_ret):
        leaq -1(%rdi, %rax), %rax
        ret
# endif

        .p2align 4,, 6
L(ret_vec_x0_test):
        lzcnt %VRCX, %VRCX
        subl %ecx, %eax
        jle L(zero_2)
# if VEC_SIZE == 64
        /* Reuse code at the end of L(ret_vec_x0_test) as we can't fit
           L(first_vec_x1_ret) in the same cache line as its jmp base
           so we might as well save code size.  */
L(first_vec_x1_ret):
# endif
        leaq -1(%rdi, %rax), %rax
        ret

        .p2align 4,, 6
L(loop_last_4x_vec):
        /* Compute remaining length.  */
        subl %edi, %eax
L(last_4x_vec):
        cmpl $(VEC_SIZE * 2), %eax
        jle L(last_2x_vec)
# if VEC_SIZE == 32
        /* Only align for VEC_SIZE == 32.  For VEC_SIZE == 64 we need
           the spare bytes to align the loop properly.  */
        .p2align 4,, 10
# endif
L(more_2x_vec):

        /* Length > VEC_SIZE * 2 so check the first 2x VEC for match
           and return if either hit.  */
        vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX

        test %VRCX, %VRCX
        jnz L(first_vec_x0)

        vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(first_vec_x1)

        /* Needed no matter what.  */
        vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX

        /* Check if we are near the end.  */
        subq $(VEC_SIZE * 4), %rax
        ja L(more_4x_vec)

        test %VRCX, %VRCX
        jnz L(first_vec_x2_test)

        /* Adjust length for the final check and check if we are at
           the end.  */
        addl $(VEC_SIZE * 1), %eax
        jle L(zero_1)

        vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX

        lzcnt %VRCX, %VRCX
        subl %ecx, %eax
        ja L(first_vec_x3_ret)
L(zero_1):
        xorl %eax, %eax
        ret
L(first_vec_x3_ret):
        leaq -1(%rdi, %rax), %rax
        ret

        .p2align 4,, 6
L(first_vec_x2_test):
        /* Must adjust length before check.  */
        subl $-(VEC_SIZE * 2 - 1), %eax
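        /* On entry eax == (aligned length) - VEC_SIZE * 4 and the
           match mask in rcx is for the VEC ending VEC_SIZE * 2 below
           the aligned end, so after adding VEC_SIZE * 2 - 1 the lzcnt
           subtraction below lands eax exactly on the match offset
           from rdi (negative means the match is before the start of
           the buffer).  */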
        lzcnt %VRCX, %VRCX
        subl %ecx, %eax
        jl L(zero_4)
        addq %rdi, %rax
        ret


        .p2align 4,, 10
L(first_vec_x0):
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * -1)(%rdi, %rax), %rax
        addq %rcx, %rax
        ret

        /* Fits unobtrusively here.  */
L(zero_4):
        xorl %eax, %eax
        ret

        .p2align 4,, 10
L(first_vec_x1):
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * -2)(%rdi, %rax), %rax
        addq %rcx, %rax
        ret

        .p2align 4,, 8
L(first_vec_x3):
        bsr %VRCX, %VRCX
        addq %rdi, %rax
        addq %rcx, %rax
        ret

        .p2align 4,, 6
L(first_vec_x2):
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * 1)(%rdi, %rax), %rax
        addq %rcx, %rax
        ret

        .p2align 4,, 2
L(more_4x_vec):
        test %VRCX, %VRCX
        jnz L(first_vec_x2)

        vpcmpeqb (%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX

        test %VRCX, %VRCX
        jnz L(first_vec_x3)

        /* Check if near end before re-aligning (otherwise might do an
           unnecessary loop iteration).  */
        cmpq $(VEC_SIZE * 4), %rax
        jbe L(last_4x_vec)


        /* NB: We set up the loop to NOT use index-address-mode for
           the buffer.  This costs some instructions & code size but
           avoids stalls due to unlaminated micro-fused instructions
           (as used in the loop) being forced to issue in the same
           group (essentially narrowing the backend width).  */

        /* Get the end pointer for the loop in rdx.  NB: Can't just do
           while rax > rdi because lengths that overflow can be valid
           and break the comparison.  */
# if VEC_SIZE == 64
        /* Use rdx as an intermediate to compute rax; this gets us
           imm8 encoding which just allows the L(more_4x_vec) block to
           fit in 1 cache line.  */
        leaq (VEC_SIZE * 4)(%rdi), %rdx
        leaq (VEC_SIZE * -1)(%rdx, %rax), %rax

        /* No EVEX machine has partial register stalls.  This can be
           replaced with: `andq $(VEC_SIZE * -4), %rax/%rdx` if that
           changes.  */
        xorb %al, %al
        xorb %dl, %dl
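        /* NB: for VEC_SIZE == 64, VEC_SIZE * 4 == 256, so clearing
           the low byte rounds both pointers down to a 4x VEC
           boundary.  */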
# else
        leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
        andq $(VEC_SIZE * -4), %rax
        leaq (VEC_SIZE * 4)(%rdi), %rdx
        andq $(VEC_SIZE * -4), %rdx
# endif
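        /* Either way rax and rdx are now 4x VEC aligned: the loop
           below walks rax down in 4x VEC steps (its first iteration
           may re-check bytes already covered above) and stops once
           rax == rdx, leaving a tail of at most 4x VEC bytes for
           L(loop_last_4x_vec).  */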


        .p2align 4
L(loop_4x_vec):
        /* NB: We could do the same optimization here as we do for
           memchr/rawmemchr by using VEX encoding in the loop for
           access to VEX vpcmpeqb + vpternlogd.  Since memrchr is not
           as hot as memchr it may not be worth the extra code size,
           but if the need arises it is an easy ~15% perf improvement
           to the loop.  */

        cmpq %rdx, %rax
        je L(loop_last_4x_vec)
        /* Store 1 where not-equal and 0 where equal in k1 (used to
           mask later on).  */
        vpcmpb $4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1

        /* VEC(2/3) will have a zero byte where we found a CHAR.  */
        vpxorq (VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2)
        vpxorq (VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3)
        vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4

        /* Combine VEC(2/3) with min and maskz with k1 (k1 has a zero
           bit where CHAR is found and VEC(2/3) have a zero byte where
           CHAR is found).  */
        vpminub %VMM(2), %VMM(3), %VMM(3){%k1}{z}
        vptestnmb %VMM(3), %VMM(3), %k2
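        /* k2 now has a set bit wherever any of the first three VECs
           matched CHAR at that byte position: bytes masked off by k1
           (matches in the first VEC) are forced to zero, and the min
           is zero where VEC(2) or VEC(3) matched.  Together with k4
           this covers the whole 4x VEC block.  */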

        addq $-(VEC_SIZE * 4), %rax

        /* Any 1s and we found CHAR.  */
        KORTEST %k2, %k4
        jz L(loop_4x_vec)


        /* k1 has non-matches for the first VEC.  `inc` will overflow
           rcx iff all bytes were non-matches.  */
        KMOV %k1, %VRCX
        inc %VRCX
        jnz L(first_vec_x0_end)

        vptestnmb %VMM(2), %VMM(2), %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(first_vec_x1_end)
        KMOV %k2, %VRCX

        /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
           returning the last 2x VEC.  For VEC_SIZE == 64 we test each
           VEC individually, for VEC_SIZE == 32 we combine them in a
           single 64-bit GPR.  */
# if VEC_SIZE == 64
        test %VRCX, %VRCX
        jnz L(first_vec_x2_end)
        KMOV %k4, %VRCX
# else
        /* Combine the last 2 VEC matches for VEC_SIZE == 32.  If rcx
           (from VEC(3)) is zero (no CHAR in VEC(3)) then it won't
           affect the result in rsi (from VEC(4)).  If rcx is non-zero
           then CHAR is in VEC(3) and bsrq will use that position.  */
        KMOV %k4, %VRSI
        salq $32, %rcx
        orq %rsi, %rcx
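        /* After the shift/or, bit i of rcx corresponds to byte
           rax + i across the combined VEC(3)/VEC(4) region (rax was
           already moved to the bottom of the 4x VEC block), so bsrq
           directly yields the offset of the last match.  */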
# endif
        bsrq %rcx, %rcx
        addq %rcx, %rax
        ret

        .p2align 4,, 4
L(first_vec_x0_end):
        /* rcx has 1s at non-matches so we need to `not` it.  We used
           `inc` to test for zero so use `neg` to complete the `not`,
           so the last 1 bit represents a match.  NB: -(x + 1) == ~x.  */
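        /* 8-bit illustration: with a single match at byte 3 the
           non-match mask is x == 0xf7; after the `inc` above rcx
           holds x + 1 == 0xf8 (non-zero, so we branched here), and
           `neg` gives -(x + 1) == 0x08 == ~x, so bsr finds bit 3.  */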
        neg %VRCX
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
        ret

        .p2align 4,, 10
L(first_vec_x1_end):
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
        ret

# if VEC_SIZE == 64
        /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
           we need a return label for it.  */
        .p2align 4,, 4
L(first_vec_x2_end):
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * 1)(%rcx, %rax), %rax
        ret
# endif


        .p2align 4,, 4
L(page_cross):
        /* Only the low bits eax[log2(VEC_SIZE) - 1:0] of the page
           offset can be non-zero here (that is what sent us down the
           page-cross path), so movzbl is enough to get the offset of
           the last byte within the aligned VEC.  */
        movzbl %al, %ecx
        andq $-VEC_SIZE, %rax
        vpcmpeqb (%rax), %VMATCH, %k0
        KMOV %k0, %VRSI

        /* eax was computed as %rdi + %rdx - 1 so we need to add back
           1 here.  */
        leal 1(%rcx), %r8d

        /* Invert ecx to get the shift count that shifts matches past
           the end of the buffer out of the mask.  */
        notl %ecx
        shlx %VRCX, %VRSI, %VRSI
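        /* E.g. if the last in-bounds byte sits at offset ecx within
           the aligned VEC, the shift count used is VEC_SIZE - 1 - ecx
           (shift counts are taken mod the operand size), so matches
           past the end fall off the top and the last in-bounds byte's
           bit lands in the MSB; lzcnt below then counts bytes back
           from the end of the buffer.  */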

        /* If rdx <= r8 then the entire [buf, buf + len) range is
           handled in this page-cross case.  NB: we can't use the
           trick here that we use in the non-page-cross case because
           we aren't checking a full VEC_SIZE.  */
        cmpq %r8, %rdx
        ja L(page_cross_check)
        lzcnt %VRSI, %VRSI
        subl %esi, %edx
        ja L(page_cross_ret)
        xorl %eax, %eax
        ret

L(page_cross_check):
        test %VRSI, %VRSI
        jz L(page_cross_continue)

        lzcnt %VRSI, %VRSI
        subl %esi, %edx
L(page_cross_ret):
        leaq -1(%rdi, %rdx), %rax
        ret
END(MEMRCHR)
#endif


source code of glibc/sysdeps/x86_64/multiarch/memrchr-evex.S