1 | /* Copyright (C) 2011-2022 Free Software Foundation, Inc. |
2 | This file is part of the GNU C Library. |
3 | |
4 | The GNU C Library is free software; you can redistribute it and/or |
5 | modify it under the terms of the GNU Lesser General Public |
6 | License as published by the Free Software Foundation; either |
7 | version 2.1 of the License, or (at your option) any later version. |
8 | |
9 | The GNU C Library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with the GNU C Library; if not, see |
16 | <https://www.gnu.org/licenses/>. */ |
17 | |
18 | #include <sysdep.h> |
19 | |
#ifdef USE_AS_WMEMCHR
# define MEMCHR wmemchr		/* build as wmemchr (4-byte elements) */
# define PCMPEQ pcmpeqd		/* compare packed 32-bit elements */
# define CHAR_PER_VEC 4		/* wchar_t elements per 16-byte vector */
#else
# define MEMCHR memchr		/* build as plain byte memchr */
# define PCMPEQ pcmpeqb		/* compare packed 8-bit elements */
# define CHAR_PER_VEC 16	/* bytes per 16-byte vector */
#endif
29 | |
/* Fast SSE2 version using pmaxub and a 64-byte loop.  */
31 | |
32 | .text |
/* void *memchr (const void *s, int c, size_t n)
   (or wchar_t *wmemchr (const wchar_t *s, wchar_t c, size_t n) when
   USE_AS_WMEMCHR is defined).

   SysV AMD64 ABI.  In: rdi = s, esi = c, rdx = n (in characters,
   i.e. bytes for memchr, wchar_t units for wmemchr).  Out: rax =
   pointer to the first occurrence of c, or NULL if c does not occur
   in the first n characters.  Clobbers: rcx, rsi, rdx, xmm0-xmm4,
   flags.  */
ENTRY(MEMCHR)
	movd	%esi, %xmm1		/* xmm1 low element = c */
	mov	%edi, %ecx		/* ecx = low bits of s for alignment tests */

#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
#ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP	/* n == 0 -> no match by definition */
	jz	L(return_null)
#else
	/* Duplicate byte c into the low 4 bytes of xmm1; the n == 0
	   test is interleaved between the two unpacks.  */
	punpcklbw %xmm1, %xmm1
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
#endif

	and	$63, %ecx		/* ecx = offset of s within a 64-byte block */
	pshufd	$0, %xmm1, %xmm1	/* broadcast c to all 16 bytes of xmm1 */

	/* If s lies in the last 16 bytes of a 64-byte block, an
	   unaligned 16-byte load could extend into the next block;
	   use the aligned-load path instead.  */
	cmp	$48, %ecx
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0		/* per-element mask: 0xFF.. where equal to c */
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)		/* match in first vector: bounds-check it */
	sub	$CHAR_PER_VEC, %rdx	/* first vector fully scanned */
	jbe	L(return_null)
	add	$16, %rdi
	and	$15, %ecx		/* ecx = byte offset within a 16-byte block */
	and	$-16, %rdi		/* round rdi down to 16-byte alignment */
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx		/* byte offset -> wchar_t count */
#endif
	add	%rcx, %rdx		/* rounding down re-covers rcx chars */
	sub	$(CHAR_PER_VEC * 4), %rdx	/* bias rdx for one 64-byte step */
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	/* Aligned load of the 16-byte block containing s; discard the
	   mask bits belonging to bytes before s.  */
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  (For wmemchr the shift count is a
	   byte offset; wchar_t matches occupy aligned groups of 4 mask
	   bits, so whole-element matches survive the shift.)  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi		/* byte index -> wchar_t index */
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)		/* match lies beyond the first n chars */
	add	%rdi, %rax
	add	%rcx, %rax		/* add back the discarded leading offset */
	ret

	.p2align 4
L(unaligned_no_match):
	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
	   possible addition overflow.  */
	neg	%rcx
	add	$16, %rcx		/* rcx = 16 - (s & 15) = bytes scanned */
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx		/* bytes -> wchar_t units */
#endif
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$(CHAR_PER_VEC * 4), %rdx	/* bias rdx for one 64-byte step */
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
	/* Scan 4 x 16 bytes a vector at a time until rdi reaches a
	   64-byte boundary.  On entry rdx is already biased by
	   -(CHAR_PER_VEC * 4), so rdx > 0 here.  */
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi		/* advance before testing the last mask */
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)		/* match at rdi - 16 */

	test	$0x3f, %rdi		/* already 64-byte aligned?  */
	jz	L(align64_loop)

	sub	$(CHAR_PER_VEC * 4), %rdx	/* account for the next 64 bytes */
	jbe	L(exit_loop)

	/* Second (and last needed) 64-byte prolog pass; after it rdi is
	   within 64 bytes of an aligned boundary.  */
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	/* Round rdi down to 64-byte alignment and give back the chars
	   that will be re-scanned by the main loop.  */
	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx		/* bytes -> wchar_t units */
#endif
	add	%rcx, %rdx

	.p2align 4
L(align64_loop):
	/* Main loop: 64 aligned bytes per iteration.  PCMPEQ yields
	   all-ones/all-zeros elements, so pmaxub acts as a bytewise OR
	   of the four compare masks; one pmovmskb then tests the whole
	   64-byte block for any match.  */
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3		/* xmm3 = mask0 | mask2 */
	pmaxub	%xmm2, %xmm4		/* xmm4 = mask1 | mask3 */
	pmaxub	%xmm3, %xmm4		/* xmm4 = OR of all four masks */
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	/* Some vector in the previous 64 bytes matched; rewind and
	   locate which one.  xmm0/xmm2 still hold their masks; the
	   32- and 48-byte vectors must be recompared (xmm3/xmm4 were
	   overwritten by the pmaxub reduction).  */
	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1		/* xmm1 (the pattern) is dead past here */
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* By elimination the match is in the 48-byte vector; no bounds
	   check needed since the loop only entered with a full 64
	   chars remaining.  */
	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(exit_loop):
	/* Tail: rdx here = remaining - CHAR_PER_VEC*4, with at most
	   4 vectors left.  Re-add 2 vectors: if still <= 0, at most
	   32 bytes remain.  */
	add	$(CHAR_PER_VEC * 2), %edx
	jle	L(exit_loop_32)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)		/* partial vector: bounds-check the match */
	sub	$CHAR_PER_VEC, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)		/* partial vector: bounds-check the match */
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	/* At most 32 bytes remain; edx = remaining - CHAR_PER_VEC*2
	   after this add.  */
	add	$(CHAR_PER_VEC * 2), %edx
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)		/* bounds-check against remaining chars */
	sub	$CHAR_PER_VEC, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)		/* bounds-check against remaining chars */
	xor	%eax, %eax
	ret

	.p2align 4
L(matches0):
	/* Match in the vector at rdi - 16 (rdi was already advanced by
	   64 past a block whose last vector matched).  */
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	/* Match in the vector at rdi; eax = pmovmskb mask.  */
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	/* Match in the vector at rdi + 16.  */
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	/* Match in the vector at rdi + 32.  */
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
	/* Match at rdi + index, but the vector may extend past the
	   buffer: return NULL unless index < remaining length (rdx).  */
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi		/* byte index -> wchar_t index */
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	/* As matches_1, for the vector at rdi + 16.  */
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	/* As matches_1, for the vector at rdi + 32.  */
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	/* As matches_1, for the vector at rdi + 48.  */
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax		/* NULL: c not found within n chars */
	ret
END(MEMCHR)
368 | |
#ifndef USE_AS_WMEMCHR
/* For the byte-memchr build only: export the internal __memchr name
   and the libc-internal hidden alias.  */
strong_alias (memchr, __memchr)
libc_hidden_builtin_def(memchr)
#endif
373 | |