1/* Optimized memchr with sse2
2 Copyright (C) 2011-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
#if IS_IN (libc)

# include <sysdep.h>

/* CFI helpers: keep the DWARF unwind info in sync with a 4-byte
   push/pop of register REG.  */
# define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

/* Push/pop REG together with the matching CFI annotation.  */
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

/* Stack offsets of the incoming arguments (i386 stack-based ABI):
   STR1 = s, STR2 = c, LEN = n (memchr only).  */
# define PARMS 4
# define STR1 PARMS
# define STR2 STR1+4

# ifndef USE_AS_RAWMEMCHR
# define LEN STR2+4
/* memchr saves %edi; RETURN restores it before returning and then
   textually re-annotates the push so the CFI state stays correct for
   the code that follows the macro (CFI directives apply linearly).  */
# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
# endif

# ifndef MEMCHR
# define MEMCHR __memchr_sse2_bsf
# endif
	.text
ENTRY (MEMCHR)
/* void *memchr (const void *s, int c, size_t n)
   -- or rawmemchr (s, c) when built with USE_AS_RAWMEMCHR, in which
   case there is no length argument and no bounds checking.

   Strategy: broadcast C into every byte of %xmm1, compare 16 bytes at
   a time with PCMPEQB, turn the result into a bitmask with PMOVMSKB,
   and locate the first set bit with BSF.  The main loop handles 64
   bytes per iteration from a 64-byte-aligned address.

   Register roles:
     %eax  -- S / return value
     %ecx  -- scratch: alignment offset, then match bitmask
     %xmm1 -- 16 copies of the byte C
   memchr only:
     %edx  -- remaining byte count
     %edi  -- 16-byte-aligned running pointer (callee-saved; pushed)
   rawmemchr only:
     %edx  -- 16-byte-aligned running pointer  */

	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1

# ifndef USE_AS_RAWMEMCHR
	mov	LEN(%esp), %edx
	test	%edx, %edx
	jz	L(return_null_1)	/* n == 0: nothing to search.  */
# endif
	mov	%ecx, %eax

/* Duplicate the low byte of C across %xmm1: two PUNPCKLBW steps
   spread it to the low 4 bytes, PSHUFD below broadcasts the low
   dword to all four lanes.  */
	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1

	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

/* If S is more than 48 bytes into its 64-byte cache line, an
   unaligned 16-byte load would cross the line (and possibly a page
   boundary past the end of the buffer); take the aligned path.  */
	cmp	$48, %ecx
	ja	L(crosscache)

	movdqu	(%eax), %xmm0
	pcmpeqb	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %ecx
	test	%ecx, %ecx
	je	L(unaligned_no_match_1)
/* Check which byte is a match.  */
	bsf	%ecx, %ecx

# ifndef USE_AS_RAWMEMCHR
	sub	%ecx, %edx
	jbe	L(return_null_1)	/* Match lies at or beyond S + N.  */
# endif
	add	%ecx, %eax
	ret

	.p2align 4
L(unaligned_no_match_1):
/* No match in the first 16 bytes; continue from the next 16-byte
   boundary at or after S + 16.  */
# ifndef USE_AS_RAWMEMCHR
	sub	$16, %edx
	jbe	L(return_null_1)
	PUSH	(%edi)
	lea	16(%eax), %edi
	and	$15, %eax
	and	$-16, %edi
	/* Add the misalignment back so %edx counts bytes from the
	   rounded-down pointer in %edi.  */
	add	%eax, %edx
# else
	lea	16(%eax), %edx
	and	$-16, %edx
# endif
	jmp	L(loop_prolog)

	.p2align 4
L(return_null_1):
	xor	%eax, %eax
	ret

# ifndef USE_AS_RAWMEMCHR
	/* Textually undo the PUSH (%edi) above: CFI directives apply
	   in file order, so reset the unwind state for the code
	   below, which is reached with %edi not yet saved.  */
	CFI_POP	(%edi)
# endif

	.p2align 4
L(crosscache):
/* Handle unaligned string.  */

# ifndef USE_AS_RAWMEMCHR
	PUSH	(%edi)
	mov	%eax, %edi
	and	$15, %ecx
	and	$-16, %edi
	movdqa	(%edi), %xmm0
# else
	mov	%eax, %edx
	and	$15, %ecx
	and	$-16, %edx
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
/* Remove the leading bytes: shift out match bits for the bytes
   before S (%cl = misalignment).  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
/* Check which byte is a match.  */
	bsf	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	sub	%eax, %edx
	jbe	L(return_null)		/* Match lies at or beyond S + N.  */
	add	%edi, %eax
	add	%ecx, %eax
	RETURN
# else
	add	%edx, %eax
	add	%ecx, %eax
	ret
# endif

	.p2align 4
L(unaligned_no_match):
# ifndef USE_AS_RAWMEMCHR
	/* Calculate the last acceptable address and check for possible
	   addition overflow by using saturating math:
	   edx = ecx + edx
	   edx |= -(edx < ecx) */
	add	%ecx, %edx
	sbb	%eax, %eax		/* %eax = carry ? -1 : 0.  */
	or	%eax, %edx		/* Clamp to 0xffffffff on overflow.  */
	sub	$16, %edx
	jbe	L(return_null)
	add	$16, %edi
# else
	add	$16, %edx
# endif

	.p2align 4
/* Loop start on aligned string.  Four 16-byte chunks are scanned
   one at a time until the pointer reaches 64-byte alignment.  */
L(loop_prolog):
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)		/* Fewer than 64 bytes left.  */
	movdqa	(%edi), %xmm0
# else
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

# ifndef USE_AS_RAWMEMCHR
	movdqa	16(%edi), %xmm2
# else
	movdqa	16(%edx), %xmm2
# endif
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

# ifndef USE_AS_RAWMEMCHR
	movdqa	48(%edi), %xmm4
# else
	movdqa	48(%edx), %xmm4
# endif
	pcmpeqb	%xmm1, %xmm4

/* Advance past the 64 bytes just examined before testing the last
   chunk; L(matches0) compensates with a -16 offset.  */
# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

/* If the pointer is now 64-byte aligned, enter the unrolled loop;
   otherwise scan one more 64-byte group chunk by chunk.  */
# ifndef USE_AS_RAWMEMCHR
	test	$0x3f, %edi
# else
	test	$0x3f, %edx
# endif
	jz	L(align64_loop)

# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
# else
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

# ifndef USE_AS_RAWMEMCHR
	movdqa	16(%edi), %xmm2
# else
	movdqa	16(%edx), %xmm2
# endif
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

# ifndef USE_AS_RAWMEMCHR
	movdqa	48(%edi), %xmm3
# else
	movdqa	48(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif
	test	%eax, %eax
	jnz	L(matches0)

/* Round the pointer down to 64-byte alignment; for memchr credit the
   rounded-off bytes back to the remaining count so the bound stays
   exact.  */
# ifndef USE_AS_RAWMEMCHR
	mov	%edi, %ecx
	and	$-64, %edi
	and	$63, %ecx
	add	%ecx, %edx
# else
	and	$-64, %edx
# endif

	.p2align 4
/* Main loop: 64 bytes per iteration from a 64-byte-aligned
   address.  */
L(align64_loop):
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
	movdqa	16(%edi), %xmm2
	movdqa	32(%edi), %xmm3
	movdqa	48(%edi), %xmm4
# else
	movdqa	(%edx), %xmm0
	movdqa	16(%edx), %xmm2
	movdqa	32(%edx), %xmm3
	movdqa	48(%edx), %xmm4
# endif
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

/* Each comparison byte is 0x00 or 0xff, so the byte-wise unsigned
   max merges all four results: %xmm4 is nonzero iff any of the 64
   bytes matched.  Only one PMOVMSKB/TEST is needed per iteration.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif

	test	%eax, %eax
	jz	L(align64_loop)

/* A match exists somewhere in the previous 64 bytes; back up and
   re-identify which chunk it is in.  */
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edi
# else
	sub	$64, %edx
# endif

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

/* %xmm3/%xmm4 were clobbered by the PMAXUB merge; redo the last two
   comparisons.  */
# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif

	pcmpeqb	%xmm1, %xmm3

# ifndef USE_AS_RAWMEMCHR
	pcmpeqb	48(%edi), %xmm1
# else
	pcmpeqb	48(%edx), %xmm1
# endif
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

/* Match must be in the last 16-byte chunk (no bounds check needed:
   the loop only runs with >= 64 bytes remaining).  */
	pmovmskb %xmm1, %eax
	bsf	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	lea	48(%edi, %eax), %eax
	RETURN
# else
	lea	48(%edx, %eax), %eax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
	.p2align 4
/* Tail handling: %edx went <= 0 in the loop; restore it to the true
   remaining count (1..64) and scan up to four chunks with bounds
   checks on any match found.  */
L(exit_loop):
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%edi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%edi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)		/* Bounds-checked variant.  */
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	48(%edi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)		/* Bounds-checked variant.  */
	xor	%eax, %eax
	RETURN

	.p2align 4
/* At most 32 bytes remain.  */
L(exit_loop_32):
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)		/* Bounds-checked variant.  */
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	16(%edi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)		/* Bounds-checked variant.  */
	xor	%eax, %eax
	RETURN
# endif
	.p2align 4
/* Match in the fourth chunk, found after the pointer was already
   advanced by 64: the chunk starts at pointer - 16.  */
L(matches0):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	-16(%eax, %edi), %eax
	RETURN
# else
	lea	-16(%eax, %edx), %eax
	ret
# endif

	.p2align 4
/* Match in the chunk at offset 0 from the running pointer.  */
L(matches):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	add	%edi, %eax
	RETURN
# else
	add	%edx, %eax
	ret
# endif

	.p2align 4
/* Match in the chunk at offset 16.  */
L(matches16):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	16(%eax, %edi), %eax
	RETURN
# else
	lea	16(%eax, %edx), %eax
	ret
# endif

	.p2align 4
/* Match in the chunk at offset 32.  */
L(matches32):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	32(%eax, %edi), %eax
	RETURN
# else
	lea	32(%eax, %edx), %eax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
/* Bounds-checked match handlers for the tail: %edx holds the bytes
   remaining from the current chunk; a match at or past S + N returns
   NULL.  */
	.p2align 4
L(matches_1):
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	add	%edi, %eax
	RETURN

	.p2align 4
L(matches16_1):
	sub	$16, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	16(%edi, %eax), %eax
	RETURN

	.p2align 4
L(matches32_1):
	sub	$32, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	32(%edi, %eax), %eax
	RETURN

	.p2align 4
L(matches48_1):
	sub	$48, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	48(%edi, %eax), %eax
	RETURN
# endif
	.p2align 4
L(return_null):
	xor	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	RETURN
# else
	ret
# endif

END (MEMCHR)
501#endif
502

/* source: glibc/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S */