/* Optimized memrchr with sse2
   Copyright (C) 2011-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

# define PARMS 4
# define STR1 PARMS
# define STR2 STR1+4
# define LEN STR2+4

# define MEMCHR __memrchr_sse2_bsf

	.text
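/* __memrchr_sse2_bsf (s, c, n) returns a pointer to the last occurrence of
   the byte C in the first N bytes of S, or NULL if C does not occur.  The
   byte C is broadcast to all 16 lanes of %xmm1, and the buffer is scanned
   from its end towards its start in 16- and 64-byte blocks, using
   pcmpeqb/pmovmskb to build match masks and bsr to pick the highest
   (i.e. last) matching byte.  */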
ENTRY (MEMCHR)
	mov STR1(%esp), %ecx
	movd STR2(%esp), %xmm1
	mov LEN(%esp), %edx

	sub $16, %edx
	jbe L(length_less16)

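/* Broadcast the byte to search for: two punpcklbw plus pshufd $0 replicate
   the low byte of %xmm1 into all 16 byte lanes.  %ecx is advanced to the
   last 16 bytes of the buffer, which are checked first with an unaligned
   load.  */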
	punpcklbw %xmm1, %xmm1
	add %edx, %ecx
	punpcklbw %xmm1, %xmm1

	movdqu (%ecx), %xmm0
	pshufd $0, %xmm1, %xmm1
	pcmpeqb %xmm1, %xmm0

/* Check if there is a match. */
	pmovmskb %xmm0, %eax
	test %eax, %eax
	jnz L(matches0)

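/* No match in the last 16 bytes.  Round %ecx down to a 16-byte boundary
   (adjusting %edx to match) so that all further loads are aligned, then
   fall into the prolog that scans 64 bytes per pass, highest block first.  */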
	sub $64, %ecx
	mov %ecx, %eax
	and $15, %eax
	jz L(loop_prolog)

	add $16, %ecx
	add $16, %edx
	sub %eax, %ecx
	sub %eax, %edx

	.p2align 4
/* Loop start on aligned string. */
L(loop_prolog):
	sub $64, %edx
	jbe L(exit_loop)

	movdqa 48(%ecx), %xmm0
	pcmpeqb %xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test %eax, %eax
	jnz L(matches48)

	movdqa 32(%ecx), %xmm2
	pcmpeqb %xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test %eax, %eax
	jnz L(matches32)

	movdqa 16(%ecx), %xmm3
	pcmpeqb %xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test %eax, %eax
	jnz L(matches16)

	movdqa (%ecx), %xmm4
	pcmpeqb %xmm1, %xmm4
	pmovmskb %xmm4, %eax
	test %eax, %eax
	jnz L(matches0)

	sub $64, %ecx
	sub $64, %edx
	jbe L(exit_loop)

	movdqa 48(%ecx), %xmm0
	pcmpeqb %xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test %eax, %eax
	jnz L(matches48)

	movdqa 32(%ecx), %xmm2
	pcmpeqb %xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test %eax, %eax
	jnz L(matches32)

	movdqa 16(%ecx), %xmm3
	pcmpeqb %xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test %eax, %eax
	jnz L(matches16)

	movdqa (%ecx), %xmm3
	pcmpeqb %xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test %eax, %eax
	jnz L(matches0)

	mov %ecx, %eax
	and $63, %eax
	test %eax, %eax
	jz L(align64_loop)

	add $64, %ecx
	add $64, %edx
	sub %eax, %ecx
	sub %eax, %edx

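/* Main loop: %ecx is now 64-byte aligned.  Each iteration compares four
   aligned 16-byte blocks against %xmm1 and merges the compare results with
   pmaxub (each result byte is 0x00 or 0xff, so max acts as OR); a single
   pmovmskb/test then detects a match anywhere in the 64 bytes, and only on
   a hit are the individual blocks re-examined, highest address first.  */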
	.p2align 4
L(align64_loop):
	sub $64, %ecx
	sub $64, %edx
	jbe L(exit_loop)

	movdqa (%ecx), %xmm0
	movdqa 16(%ecx), %xmm2
	movdqa 32(%ecx), %xmm3
	movdqa 48(%ecx), %xmm4

	pcmpeqb %xmm1, %xmm0
	pcmpeqb %xmm1, %xmm2
	pcmpeqb %xmm1, %xmm3
	pcmpeqb %xmm1, %xmm4

	pmaxub %xmm3, %xmm0
	pmaxub %xmm4, %xmm2
	pmaxub %xmm0, %xmm2
	pmovmskb %xmm2, %eax

	test %eax, %eax
	jz L(align64_loop)

	pmovmskb %xmm4, %eax
	test %eax, %eax
	jnz L(matches48)

	pmovmskb %xmm3, %eax
	test %eax, %eax
	jnz L(matches32)

	movdqa 16(%ecx), %xmm2

	pcmpeqb %xmm1, %xmm2
	pcmpeqb (%ecx), %xmm1

	pmovmskb %xmm2, %eax
	test %eax, %eax
	jnz L(matches16)

	pmovmskb %xmm1, %eax
	bsr %eax, %eax

	add %ecx, %eax
	ret

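/* Fewer than 64 bytes of the buffer remain.  Check the remaining aligned
   16-byte blocks from the highest address down; blocks that may extend
   below the start of the buffer use the L(matches*_1) paths, which reject
   matches falling before it.  */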
	.p2align 4
L(exit_loop):
	add $64, %edx
	cmp $32, %edx
	jbe L(exit_loop_32)

	movdqa 48(%ecx), %xmm0
	pcmpeqb %xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test %eax, %eax
	jnz L(matches48)

	movdqa 32(%ecx), %xmm2
	pcmpeqb %xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test %eax, %eax
	jnz L(matches32)

	movdqa 16(%ecx), %xmm3
	pcmpeqb %xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test %eax, %eax
	jnz L(matches16_1)
	cmp $48, %edx
	jbe L(return_null)

	pcmpeqb (%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test %eax, %eax
	jnz L(matches0_1)
	xor %eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	movdqa 48(%ecx), %xmm0
	pcmpeqb %xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test %eax, %eax
	jnz L(matches48_1)
	cmp $16, %edx
	jbe L(return_null)

	pcmpeqb 32(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test %eax, %eax
	jnz L(matches32_1)
	xor %eax, %eax
	ret

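/* A match was found in one of the 16-byte blocks.  bsr returns the index of
   the most significant set bit in the mask, i.e. the matching byte with the
   highest address; the block's offset from %ecx is added back to form the
   return value.  */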
	.p2align 4
L(matches0):
	bsr %eax, %eax
	add %ecx, %eax
	ret

	.p2align 4
L(matches16):
	bsr %eax, %eax
	lea 16(%eax, %ecx), %eax
	ret

	.p2align 4
L(matches32):
	bsr %eax, %eax
	lea 32(%eax, %ecx), %eax
	ret

	.p2align 4
L(matches48):
	bsr %eax, %eax
	lea 48(%eax, %ecx), %eax
	ret

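/* Same as the L(matches*) paths above, but for blocks that may extend below
   the start of the buffer: the match offset is checked against the
   remaining length in %edx, and NULL is returned if the match falls before
   the buffer.  */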
	.p2align 4
L(matches0_1):
	bsr %eax, %eax
	sub $64, %edx
	add %eax, %edx
	jl L(return_null)
	add %ecx, %eax
	ret

	.p2align 4
L(matches16_1):
	bsr %eax, %eax
	sub $48, %edx
	add %eax, %edx
	jl L(return_null)
	lea 16(%ecx, %eax), %eax
	ret

	.p2align 4
L(matches32_1):
	bsr %eax, %eax
	sub $32, %edx
	add %eax, %edx
	jl L(return_null)
	lea 32(%ecx, %eax), %eax
	ret

	.p2align 4
L(matches48_1):
	bsr %eax, %eax
	sub $16, %edx
	add %eax, %edx
	jl L(return_null)
	lea 48(%ecx, %eax), %eax
	ret

	.p2align 4
L(return_null):
	xor %eax, %eax
	ret

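/* The buffer is at most 16 bytes long and starts on a 16-byte boundary, so
   one aligned compare covers it.  The mask (1 << len) - 1 clears the bits
   for bytes beyond the end of the buffer before bsr picks the last
   match.  */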
	.p2align 4
L(length_less16_offset0):
	mov %dl, %cl
	pcmpeqb (%eax), %xmm1

	mov $1, %edx
	sal %cl, %edx
	sub $1, %edx
	mov %edx, %ecx

	pmovmskb %xmm1, %edx

	and %ecx, %edx
	test %edx, %edx
	jz L(return_null)

	bsr %edx, %ecx
	add %ecx, %eax
	ret

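/* Lengths of at most 16 bytes.  The buffer is read through aligned 16-byte
   loads of the block(s) containing it; mask bits corresponding to bytes
   before the start or after the end of the buffer are shifted or masked out
   before searching for the last match.  If the buffer straddles a 16-byte
   boundary, L(length_less16_part2) checks the upper block first.  */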
	.p2align 4
L(length_less16):
	punpcklbw %xmm1, %xmm1
	mov %ecx, %eax
	punpcklbw %xmm1, %xmm1
	add $16, %edx
	jz L(return_null)

	pshufd $0, %xmm1, %xmm1
	and $15, %ecx
	jz L(length_less16_offset0)

	PUSH (%edi)
	mov %cl, %dh
	add %dl, %dh
	and $-16, %eax

	sub $16, %dh
	ja L(length_less16_part2)

	pcmpeqb (%eax), %xmm1
	pmovmskb %xmm1, %edi

	sar %cl, %edi
	add %ecx, %eax
	mov %dl, %cl

	mov $1, %edx
	sal %cl, %edx
	sub $1, %edx

	and %edx, %edi
	test %edi, %edi
	jz L(ret_null)

	bsr %edi, %edi
	add %edi, %eax
	POP (%edi)
	ret

	CFI_PUSH (%edi)

	.p2align 4
L(length_less16_part2):
	movdqa 16(%eax), %xmm2
	pcmpeqb %xmm1, %xmm2
	pmovmskb %xmm2, %edi

	mov %cl, %ch

	mov %dh, %cl
	mov $1, %edx
	sal %cl, %edx
	sub $1, %edx

	and %edx, %edi

	test %edi, %edi
	jnz L(length_less16_part2_return)

	pcmpeqb (%eax), %xmm1
	pmovmskb %xmm1, %edi

	mov %ch, %cl
	sar %cl, %edi
	test %edi, %edi
	jz L(ret_null)

	bsr %edi, %edi
	add %edi, %eax
	xor %ch, %ch
	add %ecx, %eax
	POP (%edi)
	ret

	CFI_PUSH (%edi)

	.p2align 4
L(length_less16_part2_return):
	bsr %edi, %edi
	lea 16(%eax, %edi), %eax
	POP (%edi)
	ret

	CFI_PUSH (%edi)

	.p2align 4
L(ret_null):
	xor %eax, %eax
	POP (%edi)
	ret

END (MEMCHR)
#endif

/* Source: glibc/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S.  */