/* Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifdef USE_AS_WMEMCHR
# define MEMCHR		wmemchr
# define PCMPEQ		pcmpeqd
# define CHAR_PER_VEC	4
#else
# define MEMCHR		memchr
# define PCMPEQ		pcmpeqb
# define CHAR_PER_VEC	16
#endif
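
/* The same body builds both functions: wmemchr compares whole 4-byte
   wchar_t elements with pcmpeqd, while memchr compares single bytes
   with pcmpeqb, so each 16-byte XMM vector covers CHAR_PER_VEC
   characters.  The length in RDX is counted in characters, not
   bytes.  */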

/* Fast SSE2 version using pmaxub and a 64-byte inner loop.  */
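
/* Rough shape of the code below, as a C-like sketch (for orientation
   only; the real length bookkeeping is done incrementally by the
   sub/jbe pairs that follow):

     if (n == 0) return NULL;
     broadcast c into xmm1;
     test the first, possibly unaligned, 16 bytes;
     align rdi to 16 and later to 64 bytes, compensating rdx for
       any characters examined twice;
     while (more than 4 * CHAR_PER_VEC characters remain)
       {
	 compare four aligned 16-byte vectors against xmm1;
	 fold the results with pmaxub;	-- nonzero iff any match
	 if (fold != 0) rescan the block to locate the match;
	 rdi += 64;
       }
     check the remaining vectors, honoring rdx exactly;
     return NULL;

   Loads may touch bytes past rdi + rdx, but aligned 16-byte loads
   never cross a page boundary and the initial unaligned load is used
   only when it cannot, so no fault occurs; any match found beyond
   the length is rejected.  */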

	.text
ENTRY(MEMCHR)
	movd	%esi, %xmm1
	mov	%edi, %ecx

#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
#ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
#else
	punpcklbw %xmm1, %xmm1
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
#endif
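
	/* Broadcast the search character across xmm1.  For memchr the
	   two punpcklbw steps above replicate the byte into every
	   byte of the low doubleword (with the zero-length test
	   sitting between them); for wmemchr the wchar_t already
	   fills a doubleword.  The pshufd below then copies that
	   doubleword to all four lanes.  */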

	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

	cmp	$48, %ecx
	ja	L(crosscache)
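
	/* The first 16 bytes are loaded unaligned, which is safe only
	   if the load cannot cross into an unmapped page.  With an
	   offset of at most 48 within a 64-byte block the load stays
	   inside that block, and therefore inside the page; larger
	   offsets take the aligned L(crosscache) path instead.  */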

	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %rdx
	jbe	L(return_null)
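
	/* The 16 bytes at the original rdi have been checked, but rdx
	   lost a full CHAR_PER_VEC above.  Advance rdi to the next
	   16-byte boundary and credit the overlapping (rdi & 15)
	   characters back to rdx.  */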
	add	$16, %rdi
	and	$15, %ecx
	and	$-16, %rdi
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
#endif
	add	%rcx, %rdx
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" as
	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid
	   possible addition overflow.  */
	neg	%rcx
	add	$16, %rcx
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
#endif
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	test	$0x3f, %rdi
	jz	L(align64_loop)
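
	/* rdi is 16-byte but not yet 64-byte aligned.  Scan one more
	   64-byte block, then round rdi down to a 64-byte boundary
	   for the main loop.  */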

	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)
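
	/* Round rdi down to a 64-byte boundary for the aligned loop.
	   The characters stepped back over were already examined, so
	   they are credited back to the remaining length in rdx.  */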

	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
#endif
	add	%rcx, %rdx

	.p2align 4
L(align64_loop):
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

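	/* Fold the four compare results with byte-wise unsigned
	   maxima.  Each PCMPEQ element is 0x00 or 0xff, so the folded
	   vector is nonzero iff any of the 64 bytes matched; a single
	   pmovmskb/test per iteration then decides whether to stay in
	   the loop.  This works for wmemchr too, where a match sets
	   four bytes at once.  */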
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	sub	$64, %rdi
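
	/* The 64 bytes just scanned contain a match.  xmm0 and xmm2
	   still hold their compare results, but the xmm3/xmm4 results
	   were clobbered by the folding, so the last two vectors are
	   compared again.  The final PCMPEQ overwrites xmm1, which is
	   harmless because every path from here returns.  */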

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

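	/* Tail handling: at this point rdx holds the remaining length
	   minus 4 * CHAR_PER_VEC.  Adding 2 * CHAR_PER_VEC back sorts
	   tails of more than two vectors from shorter ones; vectors
	   known to lie fully within the remaining length use the
	   unchecked L(matches*) returns, the rest use the
	   length-checked L(matches*_1) variants.  */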
	.p2align 4
L(exit_loop):
	add	$(CHAR_PER_VEC * 2), %edx
	jle	L(exit_loop_32)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	sub	$CHAR_PER_VEC, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	ret

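	/* At most 2 * CHAR_PER_VEC characters remain; the add below
	   restores the true remaining count in edx.  */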
	.p2align 4
L(exit_loop_32):
	add	$(CHAR_PER_VEC * 2), %edx
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	ret

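	/* Return paths for matches known to be in bounds: bsf yields
	   the lowest set bit of the mask, which is the byte offset of
	   the match within its vector.  For wmemchr a match sets four
	   mask bits, but the lowest one still addresses the first
	   byte of the matching wchar_t.  */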
	.p2align 4
L(matches0):
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

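	/* Length-checked return paths: the match offset (converted
	   from bytes to characters for wmemchr) is subtracted from
	   the remaining length first, and NULL is returned when the
	   match would fall past the end of the buffer.  */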
	.p2align 4
L(matches_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret
END(MEMCHR)

#ifndef USE_AS_WMEMCHR
strong_alias (memchr, __memchr)
libc_hidden_builtin_def(memchr)
#endif