/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

#if ISA_SHOULD_BUILD (4)

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif


# ifndef STRNLEN
#  define STRNLEN __strnlen_evex
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ vpcmpeqd
#  define VPCMPNEQ vpcmpneqd
#  define VPTESTN vptestnmd
#  define VPTEST vptestmd
#  define VPMINU vpminud
#  define CHAR_SIZE 4

# else
#  define VPCMPEQ vpcmpeqb
#  define VPCMPNEQ vpcmpneqb
#  define VPTESTN vptestnmb
#  define VPTEST vptestmb
#  define VPMINU vpminub
#  define CHAR_SIZE 1

#  define REG_WIDTH VEC_SIZE
# endif

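/* Number of CHARs handled by one VEC.  */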
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# include "reg-macros.h"

# if CHAR_PER_VEC == 32
#  define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8)
# else
#  define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32)
# endif



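/* Char offset, from the start of a 4x-VEC block, of the data handled by
   the fall-through return path after the main loop: the single fourth
   VEC when CHAR_PER_VEC == 64, otherwise the combined last 2x VEC.  */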
# if CHAR_PER_VEC == 64
#  define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
# else
#  define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
# endif


# define XZERO VMM_128(0)
# define VZERO VMM(0)
# define PAGE_SIZE 4096

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (STRNLEN, 6)
	/* Check zero length.  */
	test %RSI_LP, %RSI_LP
	jz L(zero)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %esi, %esi
# endif

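	/* If the first VEC_SIZE-byte load would cross a page boundary it
	   could fault even though the string itself ends within the page,
	   so take the cross-page path.  Only the low 12 bits of the
	   address are needed for this check (PAGE_SIZE == 4096).  */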
	movl %edi, %eax
	vpxorq %XZERO, %XZERO, %XZERO
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	ja L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
	   null CHAR.  */
	VPCMPEQ (%rdi), %VZERO, %k0

	KMOV %k0, %VRCX
	movq %rsi, %rax

	/* If src (rcx) is zero, bsf does not change the result.  NB:
	   Must use 64-bit bsf here so that upper bits of len are not
	   cleared.  */
	bsfq %rcx, %rax
	/* If rax > CHAR_PER_VEC then rcx must have been zero (no null
	   CHAR) and rsi must be > CHAR_PER_VEC.  */
	cmpq $CHAR_PER_VEC, %rax
	ja L(more_1x_vec)
	/* Check if the first match is in bounds.  */
	cmpq %rax, %rsi
	cmovb %esi, %eax
	ret


# if CHAR_PER_VEC != 32
	.p2align 4,, 2
L(zero):
L(max_0):
	movl %esi, %eax
	ret
# endif

	/* Aligned more generously than strlen; strnlen compares the
	   remaining length against 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC,
	   and 8 * CHAR_PER_VEC before reaching the loop.  */
	.p2align 4,, 10
L(more_1x_vec):
L(cross_page_continue):
	/* Compute the number of chars remaining after aligning.  */
# ifdef USE_AS_WCSLEN
	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
	   overflow.  */
	movq %rdi, %rax
	andq $(VEC_SIZE * -1), %rdi
	subq %rdi, %rax
	sarq $2, %rax
	leaq (CHAR_PER_VEC * -1)(%rax, %rsi), %rax
# else
	leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax
	andq $(VEC_SIZE * -1), %rdi
	subq %rdi, %rax
# endif
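	/* rax now holds the number of chars remaining counting from the
	   first aligned VEC boundary after %rdi (the length plus the
	   misalignment, minus CHAR_PER_VEC).  */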


	VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0

	cmpq $(CHAR_PER_VEC * 2), %rax
	ja L(more_2x_vec)

L(last_2x_vec_or_less):
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(last_vec_check)

	/* Check the end of data.  */
	SUB_SHORT (CHAR_PER_VEC, rax)
	jbe L(max_0)
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jz L(max_0)
	/* Best place for LAST_VEC_CHECK if ZMM.  */
	.p2align 4,, 8
L(last_vec_check):
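	/* edx = index of the first null CHAR within the VEC just tested;
	   eax = chars remaining counting from that VEC.  The result is
	   len - (remaining - index).  If the null falls past the bound
	   (index >= remaining, so the sub does not borrow), return the
	   max length instead via cmovae.  */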
	bsf %VRDX, %VRDX
	sub %eax, %edx
	lea (%rsi, %rdx), %eax
	cmovae %esi, %eax
	ret

# if CHAR_PER_VEC == 32
	.p2align 4,, 2
L(zero):
L(max_0):
	movl %esi, %eax
	ret
# endif

	.p2align 4,, 8
L(last_4x_vec_or_less):
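	/* Advance %rdi by 4x VEC and drop the remaining length by
	   4 * CHAR_PER_VEC.  The VEC at (VEC_SIZE * 5)(%rdi) is tested
	   before the pointer update, so afterwards the result in k0
	   corresponds to VEC_SIZE(%rdi) as the 2x-or-less path expects.  */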
	addl $(CHAR_PER_VEC * -4), %eax
	VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
	subq $(VEC_SIZE * -4), %rdi
	cmpl $(CHAR_PER_VEC * 2), %eax
	jbe L(last_2x_vec_or_less)

	.p2align 4,, 6
L(more_2x_vec):
	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
	   rechecking bounds.  */

	KMOV %k0, %VRDX

	test %VRDX, %VRDX
	jnz L(first_vec_x1)

	VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(first_vec_x2)

	cmpq $(CHAR_PER_VEC * 4), %rax
	ja L(more_4x_vec)


	VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX
	addl $(CHAR_PER_VEC * -2), %eax
	test %VRDX, %VRDX
	jnz L(last_vec_check)

	subl $(CHAR_PER_VEC), %eax
	jbe L(max_1)

	VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX

	test %VRDX, %VRDX
	jnz L(last_vec_check)
L(max_1):
	movl %esi, %eax
	ret

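	/* Return paths for a null CHAR found in the VEC at
	   (VEC_SIZE * 2)(%rdi) (x2) or at VEC_SIZE(%rdi) (x1).
	   len - remaining is the char offset of VEC_SIZE(%rdi) from the
	   start of the string, so the result is that offset plus the VEC
	   offset plus the bit index.  The more_2x_vec guarantee means the
	   null is known to be in bounds here.  */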
	.p2align 4,, 3
L(first_vec_x2):
# if VEC_SIZE == 64
	/* If VEC_SIZE == 64 we can fit the logic for the full return
	   label in the spare bytes before the next cache line.  */
	bsf %VRDX, %VRDX
	sub %eax, %esi
	leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
	ret
	.p2align 4,, 6
# else
	addl $CHAR_PER_VEC, %esi
# endif
L(first_vec_x1):
	bsf %VRDX, %VRDX
	sub %eax, %esi
	leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
	ret


	.p2align 4,, 6
L(first_vec_x4):
# if VEC_SIZE == 64
	/* If VEC_SIZE == 64 we can fit the logic for the full return
	   label in the spare bytes before the next cache line.  */
	bsf %VRDX, %VRDX
	sub %eax, %esi
	leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
	ret
	.p2align 4,, 6
# else
	addl $CHAR_PER_VEC, %esi
# endif
L(first_vec_x3):
	bsf %VRDX, %VRDX
	sub %eax, %esi
	leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
	ret

	.p2align 4,, 5
L(more_4x_vec):
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(first_vec_x3)

	VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(first_vec_x4)

	/* Check if we are within the last VEC_SIZE * 4 of the length
	   before aligning for the loop.  */
	cmpq $(CHAR_PER_VEC * 8), %rax
	jbe L(last_4x_vec_or_less)


	/* Recompute the number of chars remaining after aligning.  */
# ifdef USE_AS_WCSLEN
	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
	   overflow.  */
	leaq (VEC_SIZE * -3)(%rdi), %rdx
# else
	leaq (VEC_SIZE * -3)(%rdi, %rax), %rax
# endif

	subq $(VEC_SIZE * -1), %rdi

	/* Align data to VEC_SIZE * 4.  */
# if VEC_SIZE == 64
	/* Saves code size.  No evex512 processor has partial register
	   stalls.  If that changes, this can be replaced with `andq
	   $-(VEC_SIZE * 4), %rdi`.  */
	xorb %dil, %dil
# else
	andq $-(VEC_SIZE * 4), %rdi
# endif

# ifdef USE_AS_WCSLEN
	subq %rdi, %rdx
	sarq $2, %rdx
	addq %rdx, %rax
# else
	subq %rdi, %rax
# endif
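	/* rax = chars remaining counted from (VEC_SIZE * 4)(%rdi), which
	   matches the addressing used inside the loop; the loop stops
	   once this count drops to zero or below.  */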
	/* Compare 4 * VEC at a time forward.  */
	.p2align 4,, 11
L(loop_4x_vec):
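	/* VPMINU folds two VECs into one so a single VPTESTN can detect
	   a null CHAR in either: a lane of the minimum is zero iff that
	   lane is zero in at least one of the sources.  */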
	VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
	VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
	VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
	VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
	VPTESTN %VMM(2), %VMM(2), %k0
	VPTESTN %VMM(4), %VMM(4), %k2
	subq $-(VEC_SIZE * 4), %rdi
	/* Break if at end of length.  */
	subq $(CHAR_PER_VEC * 4), %rax
	jbe L(loop_len_end)


	KORTEST %k0, %k2
	jz L(loop_4x_vec)


L(loop_last_4x_vec):
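	/* rcx = maxlen, kept for the final clamp.  rsi = number of chars
	   from the start of the string to the end of the 4x VEC block
	   just loaded (rax counts what remains beyond that block).  */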
	movq %rsi, %rcx
	subq %rax, %rsi
	VPTESTN %VMM(1), %VMM(1), %k1
	KMOV %k1, %VRDX
	test %VRDX, %VRDX
	jnz L(last_vec_x0)

	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(last_vec_x1)

	VPTESTN %VMM(3), %VMM(3), %k0

	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
	   returning the last 2x VEC.  For VEC_SIZE == 64 we test each
	   VEC individually, for VEC_SIZE == 32 we combine them in a
	   single 64-bit GPR.  */
# if CHAR_PER_VEC == 64
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(last_vec_x2)
	KMOV %k2, %VRDX
# else
	/* We can only combine the last 2x VEC masks if CHAR_PER_VEC <= 32.
	 */
	kmovd %k2, %edx
	kmovd %k0, %eax
	salq $CHAR_PER_VEC, %rdx
	orq %rax, %rdx
# endif

	/* Fallthrough: the null CHAR is in the fourth VEC of the block
	   for ZMM, or somewhere in the combined last 2x VEC mask for YMM
	   (this mirrors first_vec_x3 in strlen-ZMM and first_vec_x2 in
	   strlen-YMM).  */
	bsfq %rdx, %rdx
	leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
	cmpq %rax, %rcx
	cmovb %rcx, %rax
	ret

	/* Handle the last 4x VEC after the loop.  All VECs have already
	   been loaded.  */
	.p2align 4,, 4
L(loop_len_end):
	KORTEST %k0, %k2
	jnz L(loop_last_4x_vec)
	movq %rsi, %rax
	ret


# if CHAR_PER_VEC == 64
	/* Since we can't combine the last 2x VEC masks when VEC_SIZE ==
	   64, we need a separate return label for the third VEC.  */
	.p2align 4,, 8
L(last_vec_x2):
	bsf %VRDX, %VRDX
	leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
	cmpq %rax, %rcx
	cmovb %rcx, %rax
	ret
# endif


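	/* The null CHAR was in the first (x0) or second (x1) VEC of the
	   4x block.  rsi counts chars up to the end of the block, so back
	   up 4x (or 3x) VEC, add the bit index, and clamp to maxlen in
	   rcx.  */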
	.p2align 4,, 10
L(last_vec_x1):
	addq $CHAR_PER_VEC, %rsi
L(last_vec_x0):
	bsf %VRDX, %VRDX
	leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
	cmpq %rax, %rcx
	cmovb %rcx, %rax
	ret


	.p2align 4,, 8
L(cross_page_boundary):
	/* Align data to VEC_SIZE.  */
	movq %rdi, %rcx
	andq $-VEC_SIZE, %rcx
	VPCMPEQ (%rcx), %VZERO, %k0

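	/* The mask in k0 covers the whole aligned VEC containing the
	   start of the string.  Shift it right by the start's char
	   offset within that VEC so that bit 0 corresponds to the first
	   char, then compare the index of the first null against both
	   the number of chars left in this VEC and the max length.  As
	   before, bsf leaves %VRDX equal to the max length when the
	   shifted mask is zero.  */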
	KMOV %k0, %VRCX
# ifdef USE_AS_WCSLEN
	shrl $2, %eax
	andl $(CHAR_PER_VEC - 1), %eax
# endif
	shrx %VRAX, %VRCX, %VRCX

	negl %eax
	andl $(CHAR_PER_VEC - 1), %eax
	movq %rsi, %rdx
	bsf %VRCX, %VRDX
	cmpq %rax, %rdx
	ja L(cross_page_continue)
	movl %edx, %eax
	cmpq %rdx, %rsi
	cmovb %esi, %eax
	ret
END (STRNLEN)
#endif

source code of glibc/sysdeps/x86_64/multiarch/strnlen-evex.S