/* Placeholder function, not used by any processor at the moment.
Copyright (C) 2022-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

#ifndef STRNLEN
#define STRNLEN __strnlen_evex512
#endif

#include "x86-evex512-vecs.h"
#include "reg-macros.h"

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifdef USE_AS_WCSLEN
# define VPCMPEQ vpcmpeqd
# define VPTESTN vptestnmd
# define VPMINU vpminud
# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
# define VPTESTN vptestnmb
# define VPMINU vpminub
# define CHAR_SIZE 1
# endif

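/* With the dword variants above each mask bit corresponds to one
4-byte wchar_t, while the byte variants yield one mask bit per
char. */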
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

.section SECTION(.text),"ax",@progbits
/* Aligning the entry point to 64 bytes provides better performance
for strings of up to one vector length. */
ENTRY_P2ALIGN (STRNLEN, 6)
/* Check zero length. */
test %RSI_LP, %RSI_LP
jz L(ret_max)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %esi, %esi
# endif

movl %edi, %eax
vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
sall $20, %eax
cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
ja L(page_cross)

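/* Illustrative C sketch of the check above: after the shift by 20
only the in-page offset bits (PAGE_SIZE is 4096, so bits 0-11 of
the address) remain in the comparable range, so the branch is taken
iff a full VEC_SIZE load from s would cross a page boundary:

    if (((uintptr_t) s % PAGE_SIZE) > PAGE_SIZE - VEC_SIZE)
      goto page_cross;
*/
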
/* Compare each [w]char against null; the corresponding mask bit is
set for every match. */
VPCMPEQ (%rdi), %VMM(0), %k0
KMOV %k0, %VRCX
/* Store max length in rax. */
mov %rsi, %rax
/* If rcx is 0, rax keeps the max length: bsf leaves its destination
unchanged for a zero source on current x86 implementations. We
cannot use VRCX and VRAX here for evex256 because the upper 32 bits
of ecx and eax may be undefined. */
bsfq %rcx, %rax
cmp $CHAR_PER_VEC, %rax
ja L(align_more)
cmpq %rax, %rsi
cmovb %esi, %eax
ret

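/* Illustrative C sketch of the fast path above (load_vec and
cmpeq_mask are illustrative names, not glibc interfaces):

    mask = cmpeq_mask (load_vec (s), zero);
    len = mask ? tzcnt (mask) : maxlen;
    return len < maxlen ? len : maxlen;
*/
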
/* At this point the max length is exhausted without finding a null
char; return it. */
.p2align 4,,3
L(ret_max):
movq %rsi, %rax
ret

L(align_more):
mov %rdi, %rax
/* Align rax to VEC_SIZE. */
andq $-VEC_SIZE, %rax
movq %rdi, %rdx
subq %rax, %rdx
# ifdef USE_AS_WCSLEN
shr $2, %VRDX
# endif
/* At this point rdx holds the number of [w]chars already compared
that the aligned compares below will scan again. */
leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
/* At this point rdx holds the number of [w]chars left to check.
From now on rdx is decremented after each vector compare. */

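/* Sketch of the bookkeeping above, with aligned = s & -VEC_SIZE and
illustrative names: the aligned compares resume at
aligned + VEC_SIZE, so

    skipped = (s - aligned) / CHAR_SIZE; // [w]chars scanned twice
    remaining = maxlen + skipped - CHAR_PER_VEC;
*/
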
/* Four single-vector compares, unrolled, before entering the
4 x VEC_SIZE loop. */
VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
subq $-VEC_SIZE, %rax
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x1)

subq $CHAR_PER_VEC, %rdx
jbe L(ret_max)

VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x2)

subq $CHAR_PER_VEC, %rdx
jbe L(ret_max)

VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x3)

subq $CHAR_PER_VEC, %rdx
jbe L(ret_max)

VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x4)

subq $CHAR_PER_VEC, %rdx
jbe L(ret_max)
/* Save pointer before 4 x VEC_SIZE alignment. */
movq %rax, %rcx

/* Align address to VEC_SIZE * 4 for loop. */
andq $-(VEC_SIZE * 4), %rax

subq %rax, %rcx
# ifdef USE_AS_WCSLEN
shr $2, %VRCX
# endif
/* rcx holds the number of [w]chars that will be re-compared due to
the alignment fix-up; rdx must be incremented by rcx to offset that
adjustment. */
addq %rcx, %rdx
/* The first iteration falls straight into the loop: rdx was already
adjusted above, so no add/subtract is needed on entry to the
4 x VEC_SIZE aligned loop. */

.p2align 4,,11
L(loop):
/* The VPMINU and VPTESTN combination performs better than the
alternative instruction combinations. */
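/* Illustrative sketch of the loop test: the unsigned min of two
vectors has a zero [w]char iff either input does, so testing the two
VPMINU results covers all four vectors, and KORTEST folds both masks
into a single branch (has_zero and the v* names are illustrative):

    m01 = min (v0, v1);
    m23 = min (v2, v3);
    if (has_zero (m01) || has_zero (m23))
      goto loopend;
*/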
VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)

VPTESTN %VMM(2), %VMM(2), %k0
VPTESTN %VMM(4), %VMM(4), %k1

subq $-(VEC_SIZE * 4), %rax
KORTEST %k0, %k1

jnz L(loopend)
subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop)
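/* Scan budget exhausted without finding a null char; return the max
length. */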
mov %rsi, %rax
ret

L(loopend):

VPTESTN %VMM(1), %VMM(1), %k2
KMOV %k2, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x1)

KMOV %k0, %VRCX
/* At this point, if k0 is non-zero, the null char must be in the
second vector. */
test %VRCX, %VRCX
jnz L(ret_vec_x2)

VPTESTN %VMM(3), %VMM(3), %k3
KMOV %k3, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x3)
/* At this point the null [w]char must be in the fourth vector, so
there is no need to test it. */
KMOV %k1, %VRCX

/* The fourth-, third- and second-vector terminations are almost
identical; they are laid out this way to avoid branching and to
reuse code from the pre-loop exit conditions. */
L(ret_vec_x4):
bsf %VRCX, %VRCX
subq %rdi, %rax
# ifdef USE_AS_WCSLEN
subq $-(VEC_SIZE * 3), %rax
shrq $2, %rax
addq %rcx, %rax
# else
leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
# endif

cmpq %rsi, %rax
cmovnb %rsi, %rax
ret

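/* Illustrative sketch of the return paths above and below: rax
points at the first vector of the current 4-vector block, N is the
vector within the block (1 to 4) that holds the null, and mask is
its compare mask, so

    len = (rax - s) / CHAR_SIZE + (N - 1) * CHAR_PER_VEC
          + tzcnt (mask);
    return len < maxlen ? len : maxlen;

For wcsnlen the byte offset is divided by CHAR_SIZE before the
in-vector index is added, since the mask holds one bit per wchar. */
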
L(ret_vec_x3):
bsf %VRCX, %VRCX
subq %rdi, %rax
# ifdef USE_AS_WCSLEN
subq $-(VEC_SIZE * 2), %rax
shrq $2, %rax
addq %rcx, %rax
# else
leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
# endif
cmpq %rsi, %rax
cmovnb %rsi, %rax
ret

L(ret_vec_x2):
subq $-VEC_SIZE, %rax
L(ret_vec_x1):
bsf %VRCX, %VRCX
subq %rdi, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
addq %rcx, %rax
cmpq %rsi, %rax
cmovnb %rsi, %rax
ret

L(page_cross):
mov %rdi, %rax
movl %edi, %ecx
andl $(VEC_SIZE - 1), %ecx
# ifdef USE_AS_WCSLEN
sarl $2, %ecx
# endif
/* ecx holds the number of [w]chars to skip as a result of the
address alignment. */
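/* Illustrative C sketch of the page-cross handling below: load the
whole aligned vector containing s and shift away the match bits of
[w]chars that precede s (the names are illustrative):

    mask = cmpeq_mask (load_vec (s & -VEC_SIZE), zero) >> skipped;
    if (mask)
      return min (tzcnt (mask), maxlen);
    if (maxlen > CHAR_PER_VEC - skipped)
      goto align_more;
    return maxlen; // no null among the first maxlen [w]chars
*/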
andq $-VEC_SIZE, %rax
VPCMPEQ (%rax), %VMM(0), %k0
KMOV %k0, %VRDX
/* Shift out the match bits of the [w]chars skipped due to the
alignment. */
shr %cl, %VRDX
jnz L(page_cross_end)
movl $CHAR_PER_VEC, %eax
sub %ecx, %eax
cmp %rax, %rsi
ja L(align_more)

L(page_cross_end):
bsf %VRDX, %VRAX
cmpq %rsi, %rax
cmovnb %esi, %eax
ret

END (STRNLEN)
#endif