/* strrchr/wcsrchr optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR __strrchr_avx2
# endif

# ifdef USE_AS_WCSRCHR
#  define VPBROADCAST vpbroadcastd
#  define VPCMPEQ vpcmpeqd
#  define VPMIN vpminud
#  define CHAR_SIZE 4
# else
#  define VPBROADCAST vpbroadcastb
#  define VPCMPEQ vpcmpeqb
#  define VPMIN vpminub
#  define CHAR_SIZE 1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
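
/* Overview of the implementation below.

   strrchr (or wcsrchr when USE_AS_WCSRCHR is defined) returns a
   pointer to the last occurrence of the search CHAR in the string,
   or NULL if there is none; searching for the null character returns
   a pointer to the terminator.  A rough scalar reference, for
   orientation only (this sketch is not part of the build):

	char *
	strrchr (const char *s, int c)
	{
	  char *last = NULL;
	  do
	    if (*s == (char) c)
	      last = (char *) s;
	  while (*s++);
	  return last;
	}

   Vector strategy: the first VEC_SIZE bytes are checked with an
   unaligned load (unless that load would cross a page, in which case
   an aligned load plus a mask shift is used).  If no null is found,
   up to two more VECs are checked after aligning.  The main loops
   then process 2x VEC per iteration.  Search CHAR matches are
   evaluated lazily: the first loop only tests for "null or CHAR
   anywhere"; once a CHAR match is seen without a null, the second
   loop records the most recent matching pair of VECs and keeps going
   until the terminator is found.  */
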
	.section SECTION(.text), "ax", @progbits
ENTRY(STRRCHR)
	movd %esi, %xmm7
	movl %edi, %eax
	/* Broadcast CHAR to YMM7.  */
	VPBROADCAST %xmm7, %ymm7
	vpxor %xmm0, %xmm0, %xmm0
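	/* ymm0 is the all-zero vector used to locate the null
	   terminator; the VEX-encoded 128-bit vpxor also clears the
	   upper half of ymm0.  */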

	/* Shift here instead of `andl` to save code size (saves a fetch
	   block).  */
	sall $20, %eax
	cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
	ja L(cross_page)
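	/* The shift discards all but the low 12 bits of the address, so
	   the branch above is taken iff (rdi % PAGE_SIZE) is greater
	   than PAGE_SIZE - VEC_SIZE, i.e. iff the unaligned VEC_SIZE
	   load below would cross a page boundary.  */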

L(page_cross_continue):
	vmovdqu (%rdi), %ymm1
	/* Check end of string match.  */
	VPCMPEQ %ymm1, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl %ecx, %ecx
	jz L(aligned_more)

	/* Only check match with search CHAR if needed.  */
	VPCMPEQ %ymm1, %ymm7, %ymm1
	vpmovmskb %ymm1, %eax
	/* Check if match before first zero.  */
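	/* blsmsk sets every bit up to and including the lowest set bit
	   of ecx (the first null), so the `andl` below keeps only CHAR
	   matches at or before the terminator.  Keeping the terminator
	   bit itself is what makes strrchr (s, '\0') work.  */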
	blsmskl %ecx, %ecx
	andl %ecx, %eax
	jz L(ret0)
	bsrl %eax, %eax
	addq %rdi, %rax
	/* We are off by 3 for wcsrchr if search CHAR is non-zero.  If
	   search CHAR is zero we are correct.  Either way `andq
	   -CHAR_SIZE, %rax` gets the correct result.  */
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
L(ret0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	/* Returns for first vec x1/x2 have hard coded backward search
	   path for earlier matches.  */
	.p2align 4,, 10
L(first_vec_x1):
	VPCMPEQ %ymm2, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	blsmskl %ecx, %ecx
	andl %ecx, %eax
	jnz L(first_vec_x1_return)

	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMPEQ %ymm1, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	testl %eax, %eax
	jz L(ret1)
	bsrl %eax, %eax
	addq %r8, %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
L(ret1):
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x0_x1_test):
	VPCMPEQ %ymm2, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	/* Check ymm2 for search CHAR match.  If no match then check ymm1
	   before returning.  */
	testl %eax, %eax
	jz L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsrl %eax, %eax
	leaq 1(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(first_vec_x2):
	VPCMPEQ %ymm3, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	blsmskl %ecx, %ecx
	/* If no in-range search CHAR match in ymm3 then need to check
	   ymm1/ymm2 for an earlier match (we delay checking search
	   CHAR matches until needed).  */
	andl %ecx, %eax
	jz L(first_vec_x0_x1_test)
	bsrl %eax, %eax
	leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(aligned_more):
	/* Save original pointer if match was in VEC 0.  */
	movq %rdi, %r8

	/* Align src.  */
	orq $(VEC_SIZE - 1), %rdi
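	/* rdi now points at the last byte of its VEC_SIZE-aligned block,
	   so 1(%rdi) is the next aligned VEC.  All offsets from here on
	   carry this off-by-one, which is why the return paths above use
	   `leaq 1(%rdi, %rax)' style addressing.  */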
	vmovdqu 1(%rdi), %ymm2
	VPCMPEQ %ymm2, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl %ecx, %ecx
	jnz L(first_vec_x1)

	vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
	VPCMPEQ %ymm3, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl %ecx, %ecx
	jnz L(first_vec_x2)

	/* Save pointer again before realigning.  */
	movq %rdi, %rsi
	addq $(VEC_SIZE + 1), %rdi
	andq $-(VEC_SIZE * 2), %rdi
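	/* rsi keeps the off-by-one base needed to address VEC1/VEC2 in
	   L(first_vec_x1_or_x2); rdi is advanced past VEC1 and rounded
	   down to a 2 * VEC_SIZE boundary for the aligned loop.  The
	   first iteration may overlap the VECs already checked, which is
	   harmless since they are known to contain no null terminator.  */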
	.p2align 4
L(first_aligned_loop):
	/* Do 2x VEC at a time.  Any more and the cost of finding the
	   match outweighs the loop benefit.  */
	vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5

	VPCMPEQ %ymm4, %ymm7, %ymm6
	VPMIN %ymm4, %ymm5, %ymm8
	VPCMPEQ %ymm5, %ymm7, %ymm10
	vpor %ymm6, %ymm10, %ymm5
	VPCMPEQ %ymm8, %ymm0, %ymm8
	vpor %ymm5, %ymm8, %ymm9
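	/* ymm6/ymm10 hold the CHAR matches for each VEC.  The unsigned
	   minimum in ymm8 has a zero element wherever either VEC has a
	   null, so after the compares ymm9 is non-zero iff either VEC
	   contains a null or a CHAR match, letting one vpmovmskb/test
	   decide whether to keep looping.  */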

	vpmovmskb %ymm9, %eax
	addq $(VEC_SIZE * 2), %rdi
	/* No zero or search CHAR.  */
	testl %eax, %eax
	jz L(first_aligned_loop)

	/* If no zero CHAR then go to second loop (this allows us to
	   throw away all prior work).  */
	vpmovmskb %ymm8, %ecx
	testl %ecx, %ecx
	jz L(second_aligned_loop_prep)

	/* Search char could be zero so we need to get the true match.
	 */
	vpmovmskb %ymm5, %eax
	testl %eax, %eax
	jnz L(first_aligned_loop_return)

	.p2align 4,, 4
L(first_vec_x1_or_x2):
	VPCMPEQ %ymm3, %ymm7, %ymm3
	VPCMPEQ %ymm2, %ymm7, %ymm2
	vpmovmskb %ymm3, %eax
	vpmovmskb %ymm2, %edx
	/* Use add for macro-fusion.  */
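	/* Both masks are zero-extended 32-bit values, so the 64-bit sum
	   below is zero exactly when both are zero; `add' can stand in
	   for `or' ahead of the `jz'.  */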
	addq %rax, %rdx
	jz L(first_vec_x0_test)
	/* NB: We could move this shift to before the branch and save a
	   bit of code size / performance on the fall through.  The
	   branch leads to the null case which generally seems hotter
	   than char in first 3x VEC.  */
	salq $32, %rax
	addq %rdx, %rax
	bsrq %rax, %rax
	leaq 1(%rsi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(first_aligned_loop_return):
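	/* Combine the null masks into rcx (the VEC at rdi - 2 * VEC_SIZE
	   in the low 32 bits, the VPMIN-derived mask in the high 32
	   bits) and the CHAR-match masks into rax in the same low/high
	   order.  blsmsk then limits the CHAR matches to those at or
	   before the first null, and bsrq picks the last survivor.  */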
	VPCMPEQ %ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %edx
	salq $32, %rcx
	orq %rdx, %rcx

	vpmovmskb %ymm10, %eax
	vpmovmskb %ymm6, %edx
	salq $32, %rax
	orq %rdx, %rax
	blsmskq %rcx, %rcx
	andq %rcx, %rax
	jz L(first_vec_x1_or_x2)

	bsrq %rax, %rax
	leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	/* Search char cannot be zero.  */
	.p2align 4
L(second_aligned_loop_set_furthest_match):
	/* Save VEC and pointer from most recent match.  */
L(second_aligned_loop_prep):
	movq %rdi, %rsi
	vmovdqu %ymm6, %ymm2
	vmovdqu %ymm10, %ymm3
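	/* ymm2/ymm3 now hold the CHAR compare results for the most
	   recent pair of VECs that contained a match, and rsi points
	   just past that pair (the register-to-register vmovdqu is
	   simply a copy).  They are only consulted once a null is
	   finally found.  */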

	.p2align 4
L(second_aligned_loop):
	/* Search 2x VEC at a time.  */
	vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5

	VPCMPEQ %ymm4, %ymm7, %ymm6
	VPMIN %ymm4, %ymm5, %ymm1
	VPCMPEQ %ymm5, %ymm7, %ymm10
	vpor %ymm6, %ymm10, %ymm5
	VPCMPEQ %ymm1, %ymm0, %ymm1
	vpor %ymm5, %ymm1, %ymm9

	vpmovmskb %ymm9, %eax
	addq $(VEC_SIZE * 2), %rdi
	testl %eax, %eax
	jz L(second_aligned_loop)
	vpmovmskb %ymm1, %ecx
	testl %ecx, %ecx
	jz L(second_aligned_loop_set_furthest_match)
	vpmovmskb %ymm5, %eax
	testl %eax, %eax
	jnz L(return_new_match)

	/* This is the hot path.  We know CHAR is inbounds and that
	   ymm3/ymm2 have the latest match.  */
	.p2align 4,, 4
L(return_old_match):
	vpmovmskb %ymm3, %eax
	vpmovmskb %ymm2, %edx
	salq $32, %rax
	orq %rdx, %rax
	bsrq %rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr.  */
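	/* rsi points just past the saved pair of VECs, so the match byte
	   is at rsi - 2 * VEC_SIZE plus the bsr index.  For wcsrchr the
	   mask sets all four bytes of a matching wide char and bsr lands
	   on its last byte, hence the extra -(CHAR_SIZE - 1).  */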
	leaq (VEC_SIZE * -2 - (CHAR_SIZE - 1))(%rsi, %rax), %rax
	VZEROUPPER_RETURN

	/* Last iteration also potentially has a match.  */
	.p2align 4,, 8
L(return_new_match):
	VPCMPEQ %ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %edx
	salq $32, %rcx
	orq %rdx, %rcx

	vpmovmskb %ymm10, %eax
	vpmovmskb %ymm6, %edx
	salq $32, %rax
	orq %rdx, %rax
	blsmskq %rcx, %rcx
	andq %rcx, %rax
	jz L(return_old_match)
	bsrq %rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr.  */
	leaq (VEC_SIZE * -2 - (CHAR_SIZE - 1))(%rdi, %rax), %rax
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(cross_page):
	movq %rdi, %rsi
	andq $-VEC_SIZE, %rsi
	vmovdqu (%rsi), %ymm1
	VPCMPEQ %ymm1, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
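	/* shrx uses only the low five bits of a 32-bit shift count, so
	   this shifts by rdi % VEC_SIZE; bit 0 of each mask then
	   corresponds to the byte at rdi itself, which is why the plain
	   `addq %rdi, %rax' below produces the match address.  */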
	shrxl %edi, %ecx, %ecx
	testl %ecx, %ecx
	jz L(page_cross_continue)
	VPCMPEQ %ymm1, %ymm7, %ymm1
	vpmovmskb %ymm1, %eax

	/* Shift out search CHAR matches that are before the beginning of
	   src (rdi).  */
	shrxl %edi, %eax, %eax
	blsmskl %ecx, %ecx
	/* Check if any search CHAR match in range.  */
	andl %ecx, %eax
	jz L(ret2)
	bsrl %eax, %eax
	addq %rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
L(ret2):
	VZEROUPPER_RETURN
END(STRRCHR)
#endif
