/* Placeholder function, not used by any processor at the moment.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#ifndef STRNLEN
#define STRNLEN	__strnlen_evex512
#endif

#include "x86-evex512-vecs.h"
#include "reg-macros.h"

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)
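/* ISA level 4 corresponds to x86-64-v4 (AVX-512 capable) targets, so
   this EVEX512 variant is only assembled when the build can use it.  */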

# include <sysdep.h>

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPTESTN	vptestnmd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPTESTN	vptestnmb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif
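/* The macros above select the dword (wchar_t) or byte flavors of the
   EVEX compare, test-against-zero, and unsigned-minimum instructions,
   so the same body can serve both the strnlen and wcsnlen builds.  */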

# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
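
/* For reference, a minimal C sketch of the semantics implemented
   below (illustrative only, not part of the build):

     size_t
     strnlen (const char *s, size_t maxlen)
     {
       size_t i;
       for (i = 0; i < maxlen && s[i] != '\0'; i++)
	 ;
       return i;
     }

   The vector code computes the same result CHAR_PER_VEC [w]chars at a
   time, while ensuring no load crosses into a page the string does
   not touch.  */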

	.section SECTION(.text),"ax",@progbits
/* Aligning the entry point to a 64-byte boundary provides better
   performance for strings of up to one vector length.  */
ENTRY_P2ALIGN (STRNLEN, 6)
	/* Check zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(ret_max)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
# endif

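	/* A full VEC_SIZE load from rdi is safe only when it cannot
	   cross into an unmapped page, i.e. when the page offset of
	   rdi is at most PAGE_SIZE - VEC_SIZE.  Shifting the offset
	   bits left by 20 discards the rest of the address, so a
	   single unsigned compare performs that check.  */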
	movl	%edi, %eax
	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
	sall	$20, %eax
	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
	ja	L(page_cross)

	/* Compare each [w]char against null; the corresponding mask
	   bit is set for every match.  */
	VPCMPEQ	(%rdi), %VMM(0), %k0
	KMOV	%k0, %VRCX
	/* Store max length in rax.  */
	mov	%rsi, %rax
	/* If rcx is 0, rax keeps the max length (BSF leaves its
	   destination unchanged for a zero source).  We cannot use
	   VRCX and VRAX here for evex256 because the upper 32 bits of
	   ecx and eax may be undefined.  */
	bsfq	%rcx, %rax
	cmp	$CHAR_PER_VEC, %rax
	ja	L(align_more)
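	/* rax now holds either the index of the first null (if one was
	   found in the first vector) or maxlen; return the smaller of
	   rax and maxlen.  */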
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
	ret

	/* Reached when maxlen is exhausted before a null terminator is
	   found; return maxlen.  */
	.p2align 4,,3
L(ret_max):
	movq	%rsi, %rax
	ret

L(align_more):
	mov	%rdi, %rax
	/* Align rax to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rax
	movq	%rdi, %rdx
	subq	%rax, %rdx
# ifdef USE_AS_WCSLEN
	shr	$2, %VRDX
# endif
	/* At this point rdx holds the number of [w]chars that the
	   first, unaligned compare already covered and that the next,
	   aligned compare will see again.  */
	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
	/* rdx now holds the number of [w]chars left to check, counted
	   from the next aligned vector.  From here on rdx is
	   decremented by each compare.  */

	/* Unroll four vector compares ahead of the 4 x VEC_SIZE
	   aligned loop.  */
	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
	subq	$-VEC_SIZE, %rax
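	/* Note: subq of a negative immediate is equivalent to addq of
	   the positive one; the negative form appears to be the idiom
	   shared with the smaller-vector variants of this code, where
	   -(VEC_SIZE * 4) fits a sign-extended 8-bit immediate while
	   +(VEC_SIZE * 4) does not.  */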
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x1)

	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)

	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x2)

	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)

	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x3)

	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)

	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x4)

	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
	/* Save pointer before 4 x VEC_SIZE alignment.  */
	movq	%rax, %rcx

	/* Align address to VEC_SIZE * 4 for the loop.  */
	andq	$-(VEC_SIZE * 4), %rax

	subq	%rax, %rcx
# ifdef USE_AS_WCSLEN
	shr	$2, %VRCX
# endif
	/* rcx holds the number of [w]chars that will be recompared as
	   a result of the alignment fix; rdx must be incremented by
	   rcx to offset that adjustment.  */
	addq	%rcx, %rdx
	/* rdx has already been adjusted by rcx, so the first iteration
	   of the 4 x VEC_SIZE aligned loop needs no extra
	   add/subtract.  */

	.p2align 4,,11
L(loop):
	/* The VPMINU and VPTESTN combination provides better
	   performance than the alternative instruction combinations:
	   the unsigned minimum of two vectors has a zero element
	   exactly when either source does, so one VPTESTN per pair
	   suffices.  */
	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)

	VPTESTN	%VMM(2), %VMM(2), %k0
	VPTESTN	%VMM(4), %VMM(4), %k1

	subq	$-(VEC_SIZE * 4), %rax
	KORTEST	%k0, %k1
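	/* KORTEST sets ZF from the OR of both masks, so a single
	   branch tests all four vectors at once.  */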

	jnz	L(loopend)
	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop)
	mov	%rsi, %rax
	ret

L(loopend):

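	/* At least one of the four vectors contains a null.  k0 covers
	   the first pair (through the VPMINU result in VMM(2)), k1 the
	   second pair; the tests below narrow down the exact vector.  */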
	VPTESTN	%VMM(1), %VMM(1), %k2
	KMOV	%k2, %VRCX
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x1)

	KMOV	%k0, %VRCX
	/* At this point, if k0 is non-zero, the null char must be in
	   the second vector.  */
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x2)

	VPTESTN	%VMM(3), %VMM(3), %k3
	KMOV	%k3, %VRCX
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x3)
	/* At this point the null [w]char must be in the fourth vector,
	   so there is no need to check.  */
	KMOV	%k1, %VRCX

	/* The fourth-, third-, and second-vector terminations are
	   nearly identical; they are implemented this way to avoid
	   branching and to reuse code from the pre-loop exit
	   conditions.  */
L(ret_vec_x4):
	bsf	%VRCX, %VRCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	subq	$-(VEC_SIZE * 3), %rax
	shrq	$2, %rax
	addq	%rcx, %rax
# else
	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
# endif
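	/* For the wchar_t case, rcx from BSF already counts dwords
	   while rax is a byte offset, hence the shift before the add;
	   the byte case folds base, bit index, and bias into one
	   LEA.  */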

	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
	ret

L(ret_vec_x3):
	bsf	%VRCX, %VRCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	subq	$-(VEC_SIZE * 2), %rax
	shrq	$2, %rax
	addq	%rcx, %rax
# else
	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
# endif
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
	ret

L(ret_vec_x2):
	subq	$-VEC_SIZE, %rax
L(ret_vec_x1):
	bsf	%VRCX, %VRCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	addq	%rcx, %rax
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
	ret

L(page_cross):
	mov	%rdi, %rax
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
# ifdef USE_AS_WCSLEN
	sarl	$2, %ecx
# endif
	/* ecx holds the number of [w]chars to be skipped as a result
	   of address alignment.  */
	andq	$-VEC_SIZE, %rax
	VPCMPEQ	(%rax), %VMM(0), %k0
	KMOV	%k0, %VRDX
	/* Shift out the match bits for the [w]chars before the start
	   of the string (the alignment adjustment).  */
	shr	%cl, %VRDX
	jnz	L(page_cross_end)
	movl	$CHAR_PER_VEC, %eax
	sub	%ecx, %eax
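	/* eax now holds the number of [w]chars actually checked in
	   this first, aligned vector.  Continue on the aligned path
	   only if maxlen is larger; otherwise fall through, where the
	   compare below clamps the result to maxlen (BSF leaves rax
	   unchanged for a zero mask).  */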
	cmp	%rax, %rsi
	ja	L(align_more)

L(page_cross_end):
	bsf	%VRDX, %VRAX
	cmpq	%rsi, %rax
	cmovnb	%esi, %eax
	ret

END (STRNLEN)
#endif