/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
   Copyright (C) 2013-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
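
/* Overview: the search character is broadcast to every lane of %xmm0,
   the string is scanned VEC_SIZE bytes at a time, and the position of
   the most recent match is tracked until the null terminator is found.
   A rough C sketch of the semantics (illustration only, not the
   control flow used below):

	char *
	strrchr (const char *s, int c)
	{
	  const char *ret = NULL;
	  do
	    if (*s == (char) c)
	      ret = s;
	  while (*s++ != '\0');
	  return (char *) ret;
	}
*/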


#include <sysdep.h>

#ifndef STRRCHR
# define STRRCHR	strrchr
#endif

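/* For wcsrchr, compare and minimum operate on 4-byte wchar_t lanes;
   for strrchr they operate on single bytes.  */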
#ifdef USE_AS_WCSRCHR
# define PCMPEQ	pcmpeqd
# define CHAR_SIZE	4
# define PMINU	pminud
#else
# define PCMPEQ	pcmpeqb
# define CHAR_SIZE	1
# define PMINU	pminub
#endif

#define PAGE_SIZE 4096
#define VEC_SIZE 16

	.text
ENTRY(STRRCHR)
	movd	%esi, %xmm0
	movq	%rdi, %rax
	andl	$(PAGE_SIZE - 1), %eax
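	/* Broadcast the low byte of CH to all 16 byte lanes (for
	   wcsrchr the pshufd below broadcasts the full 4-byte
	   wchar_t).  */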
#ifndef USE_AS_WCSRCHR
	punpcklbw %xmm0, %xmm0
	punpcklwd %xmm0, %xmm0
#endif
	pshufd	$0, %xmm0, %xmm0
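	/* If the first unaligned VEC_SIZE load would cross a page
	   boundary it could fault, so take the aligned path instead.  */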
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page)

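	/* Fall through for the common case; L(cross_page) rejoins here
	   once an unaligned VEC_SIZE load from %rdi is known safe.  */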
L(cross_page_continue):
	movups	(%rdi), %xmm1
	pxor	%xmm2, %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %ecx
	testl	%ecx, %ecx
	jz	L(aligned_more)

	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
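	/* (mask - 1) ^ mask sets every bit up to and including the
	   first null, so the and below discards matches past the
	   terminator.  */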
	leal	-1(%rcx), %edx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(ret0)
	bsrl	%eax, %eax
	addq	%rdi, %rax
	/* We are off by 3 for wcsrchr if search CHAR is non-zero.  If
	   search CHAR is zero we are correct.  Either way `andq
	   -CHAR_SIZE, %rax` gets the correct result.  */
#ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
#endif
L(ret0):
	ret

	/* Returns for the first vec x1/x2 have a hard-coded backward
	   search path for earlier matches.  */
	.p2align 4
L(first_vec_x0_test):
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	testl	%eax, %eax
	jz	L(ret0)
	bsrl	%eax, %eax
	addq	%r8, %rax
#ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
#endif
	ret

	.p2align 4
L(first_vec_x1):
	PCMPEQ	%xmm0, %xmm2
	pmovmskb %xmm2, %eax
	leal	-1(%rcx), %edx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(first_vec_x0_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE)(%rdi, %rax), %rax
#ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
#endif
	ret

	.p2align 4
L(first_vec_x1_test):
	PCMPEQ	%xmm0, %xmm2
	pmovmskb %xmm2, %eax
	testl	%eax, %eax
	jz	L(first_vec_x0_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE)(%rdi, %rax), %rax
#ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
#endif
	ret

	.p2align 4
L(first_vec_x2):
	PCMPEQ	%xmm0, %xmm3
	pmovmskb %xmm3, %eax
	leal	-1(%rcx), %edx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(first_vec_x1_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
#ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
#endif
	ret

	.p2align 4
L(aligned_more):
	/* Save original pointer if match was in VEC 0.  */
	movq	%rdi, %r8
	andq	$-VEC_SIZE, %rdi
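	/* %rdi is now VEC_SIZE-aligned; the first vector has already
	   been checked, so continue at VEC_SIZE(%rdi).  */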

	movaps	VEC_SIZE(%rdi), %xmm2
	pxor	%xmm3, %xmm3
	PCMPEQ	%xmm2, %xmm3
	pmovmskb %xmm3, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x1)

	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
	pxor	%xmm4, %xmm4
	PCMPEQ	%xmm3, %xmm4
	pmovmskb %xmm4, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x2)

	addq	$VEC_SIZE, %rdi
	/* Save pointer again before realigning.  */
	movq	%rdi, %rsi
	andq	$-(VEC_SIZE * 2), %rdi
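	/* Align to VEC_SIZE * 2 so both loads in the loop stay aligned.
	   This can step back over one already-checked vector; that is
	   harmless since it contains no null and any CHAR match in it
	   is rediscovered by the loop.  */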
	.p2align 4
L(first_loop):
	/* Do 2x VEC at a time.  */
	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
	   detecting zero.  Note: if this is found to be a bottleneck it
	   may be worth adding an SSE4.1 wcsrchr implementation.  */
#ifdef USE_AS_WCSRCHR
	movaps	%xmm5, %xmm6
	pxor	%xmm8, %xmm8

	PCMPEQ	%xmm8, %xmm5
	PCMPEQ	%xmm4, %xmm8
	por	%xmm5, %xmm8
#else
	movaps	%xmm5, %xmm6
	PMINU	%xmm4, %xmm5
#endif

	movaps	%xmm4, %xmm9
	PCMPEQ	%xmm0, %xmm4
	PCMPEQ	%xmm0, %xmm6
	movaps	%xmm6, %xmm7
	por	%xmm4, %xmm6
#ifndef USE_AS_WCSRCHR
	pxor	%xmm8, %xmm8
	PCMPEQ	%xmm5, %xmm8
#endif
	pmovmskb %xmm8, %ecx
	pmovmskb %xmm6, %eax

	addq	$(VEC_SIZE * 2), %rdi
	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
	   macro-fuse with `jz`.  */
	addl	%ecx, %eax
	jz	L(first_loop)

	/* Check whether the null terminator was found.  */
	testl	%ecx, %ecx
	jz	L(second_loop_match)

	/* Check whether there was a CHAR match in this final
	   iteration.  */
	subl	%ecx, %eax
	jnz	L(new_match)

L(first_loop_old_match):
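	/* The iteration that found the null contained no CHAR.  Fall
	   back to the two vectors checked before the loop, still live
	   in %xmm2/%xmm3 with their base address in %rsi.  */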
	PCMPEQ	%xmm0, %xmm2
	PCMPEQ	%xmm0, %xmm3
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm3, %eax
	addl	%eax, %ecx
	jz	L(first_vec_x0_test)
	/* NB: We could move this shift to before the branch and save a
	   bit of code size / performance on the fall through.  The
	   branch leads to the null case which generally seems hotter
	   than char in first 3x VEC.  */
	sall	$16, %eax
	orl	%ecx, %eax

	bsrl	%eax, %eax
	addq	%rsi, %rax
#ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
#endif
	ret

	.p2align 4
L(new_match):
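	/* %ecx holds the combined null mask; rebuild the VEC1-only null
	   mask from the raw data saved in %xmm9 and merge so the low
	   half holds VEC1, locating the first null exactly.  */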
	pxor	%xmm6, %xmm6
	PCMPEQ	%xmm9, %xmm6
	pmovmskb %xmm6, %eax
	sall	$16, %ecx
	orl	%eax, %ecx

	/* We can't reuse either of the old comparisons because we mask
	   off matches after the first zero (instead of using the full
	   comparison), so we can't guarantee there is no interference
	   between a match past the end of the string and a valid
	   match.  */
	pmovmskb %xmm4, %eax
	pmovmskb %xmm7, %edx
	sall	$16, %edx
	orl	%edx, %eax

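	/* Keep only CHAR matches at or before the first null.  */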
	leal	-1(%ecx), %edx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(first_loop_old_match)
	bsrl	%eax, %eax
	addq	%rdi, %rax
#ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
#endif
	ret

	/* Save the minimum state needed to recover the most recent
	   match; all previous work can be discarded.  */
	.p2align 4
L(second_loop_match):
	movq	%rdi, %rsi
	movaps	%xmm4, %xmm2
	movaps	%xmm7, %xmm3

	.p2align 4
L(second_loop):
	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
	   detecting zero.  Note: if this is found to be a bottleneck it
	   may be worth adding an SSE4.1 wcsrchr implementation.  */
#ifdef USE_AS_WCSRCHR
	movaps	%xmm5, %xmm6
	pxor	%xmm8, %xmm8

	PCMPEQ	%xmm8, %xmm5
	PCMPEQ	%xmm4, %xmm8
	por	%xmm5, %xmm8
#else
	movaps	%xmm5, %xmm6
	PMINU	%xmm4, %xmm5
#endif

	movaps	%xmm4, %xmm9
	PCMPEQ	%xmm0, %xmm4
	PCMPEQ	%xmm0, %xmm6
	movaps	%xmm6, %xmm7
	por	%xmm4, %xmm6
#ifndef USE_AS_WCSRCHR
	pxor	%xmm8, %xmm8
	PCMPEQ	%xmm5, %xmm8
#endif

	pmovmskb %xmm8, %ecx
	pmovmskb %xmm6, %eax

	addq	$(VEC_SIZE * 2), %rdi
	/* Either null terminator or new occurrence of CHAR.  */
	addl	%ecx, %eax
	jz	L(second_loop)

	/* No null terminator, so it must be a new occurrence of
	   CHAR.  */
	testl	%ecx, %ecx
	jz	L(second_loop_match)

	subl	%ecx, %eax
	jnz	L(second_loop_new_match)

L(second_loop_old_match):
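	/* Null found with no new CHAR in this iteration; report the
	   match saved at L(second_loop_match) (%xmm2/%xmm3, base in
	   %rsi).  */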
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm3, %eax
	sall	$16, %eax
	orl	%ecx, %eax
	bsrl	%eax, %eax
	addq	%rsi, %rax
#ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
#endif
	ret

	.p2align 4
L(second_loop_new_match):
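	/* As in L(new_match): rebuild the VEC1-only null mask from the
	   raw data in %xmm9 and merge the two 16-bit masks.  */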
	pxor	%xmm6, %xmm6
	PCMPEQ	%xmm9, %xmm6
	pmovmskb %xmm6, %eax
	sall	$16, %ecx
	orl	%eax, %ecx

	/* We can't reuse either of the old comparisons because we mask
	   off matches after the first zero (instead of using the full
	   comparison), so we can't guarantee there is no interference
	   between a match past the end of the string and a valid
	   match.  */
	pmovmskb %xmm4, %eax
	pmovmskb %xmm7, %edx
	sall	$16, %edx
	orl	%edx, %eax

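	/* Keep only CHAR matches at or before the first null.  */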
	leal	-1(%ecx), %edx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(second_loop_old_match)
	bsrl	%eax, %eax
	addq	%rdi, %rax
#ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
#endif
	ret

	.p2align 4,, 4
L(cross_page):
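	/* Align down to a VEC_SIZE boundary so the load cannot cross
	   the page, then shift both masks right by the misalignment so
	   bit 0 corresponds to the byte at %rdi.  If no null appears
	   before the page boundary, the string continues on the next
	   page, making the unaligned load at L(cross_page_continue)
	   safe.  */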
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rsi
	movaps	(%rsi), %xmm1
	pxor	%xmm2, %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %edx
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	sarl	%cl, %edx
	jz	L(cross_page_continue)
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	sarl	%cl, %eax
	leal	-1(%rdx), %ecx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(ret1)
	bsrl	%eax, %eax
	addq	%rdi, %rax
#ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
#endif
L(ret1):
	ret
END(STRRCHR)
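/* rindex is the historical BSD name for strrchr.  */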
#ifndef USE_AS_WCSRCHR
weak_alias (STRRCHR, rindex)
libc_hidden_builtin_def (STRRCHR)
#endif