1 | /* wcsrchr with SSE2, without using bsf instructions. |
2 | Copyright (C) 2011-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | # include <sysdep.h> |
21 | # define CFI_PUSH(REG) \ |
22 | cfi_adjust_cfa_offset (4); \ |
23 | cfi_rel_offset (REG, 0) /* CFI_PUSH: unwind annotation for a 4-byte push of REG. The cfi_* helpers are not defined in this file (presumably they come from sysdep.h). */ |
24 | /* CFI_POP: the inverse annotation for a 4-byte pop of REG. */ |
25 | # define CFI_POP(REG) \ |
26 | cfi_adjust_cfa_offset (-4); \ |
27 | cfi_restore (REG) |
28 | /* PUSH/POP: the real pushl/popl followed by the matching unwind annotation. */ |
29 | # define PUSH(REG) pushl REG; CFI_PUSH (REG) |
30 | # define POP(REG) popl REG; CFI_POP (REG) |
31 | /* Stack layout and entry/exit sequences used by the function below. */ |
32 | # define PARMS 8 /* Offset from %esp to the first argument once ENTRANCE has pushed %edi (presumably 4 bytes of return address plus the 4-byte saved %edi). */ |
33 | # define ENTRANCE PUSH (%edi); /* Prologue: save %edi. */ |
34 | # define RETURN POP (%edi); ret; CFI_PUSH (%edi); /* Epilogue: restore %edi and return. The trailing CFI_PUSH re-establishes the unwind state for the code assembled after the ret. */ |
35 | # define STR1 PARMS /* Stack offset of the first argument. */ |
36 | # define STR2 STR1+4 /* Stack offset of the second argument. */ |
37 | |
38 | atom_text_section |
39 | ENTRY (__wcsrchr_sse2) |
40 | /* Find the last occurrence of a 32-bit wide character in a NUL-terminated wide string. In: STR1(%esp) = string pointer, STR2(%esp) = character (offsets valid after ENTRANCE). Out: %eax = address of the last match, or 0 if there is none. %edi and %esi are saved and restored; %ecx, %edx, %xmm0-%xmm5 and the flags are overwritten. */ |
41 | ENTRANCE |
42 | mov STR1(%esp), %ecx /* ECX = string pointer. */ |
43 | movd STR2(%esp), %xmm1 /* XMM1 low dword = character to find. */ |
44 | |
45 | mov %ecx, %edi /* EDI = read pointer. */ |
46 | punpckldq %xmm1, %xmm1 |
47 | pxor %xmm2, %xmm2 /* XMM2 = 0, the comparand used to detect the NUL terminator. */ |
48 | punpckldq %xmm1, %xmm1 /* XMM1 = character replicated into all four dwords. */ |
49 | |
50 | /* ECX = offset of the string start within its 64-byte block. */ |
51 | and $63, %ecx |
52 | cmp $48, %ecx |
53 | ja L(crosscache) /* Offset above 48: a 16-byte load from the start would run past the 64-byte block. */ |
54 | |
55 | /* Offset at most 48: examine the first 16 bytes with an unaligned load. */ |
56 | movdqu (%edi), %xmm0 |
57 | pcmpeqd %xmm0, %xmm2 |
58 | pcmpeqd %xmm1, %xmm0 |
59 | /* Find where NULL is. ECX = byte mask, four bits per wide character. */ |
60 | pmovmskb %xmm2, %ecx |
61 | /* Check if there is a match. EAX = byte mask, four bits per wide character. */ |
62 | pmovmskb %xmm0, %eax |
63 | add $16, %edi /* EDI now points just past the block that was examined. */ |
64 | |
65 | test %eax, %eax |
66 | jnz L(unaligned_match1) |
67 | |
68 | test %ecx, %ecx |
69 | jnz L(return_null) /* String ended in the first block without a match. */ |
70 | |
71 | and $-16, %edi /* Round down to an aligned address; the loop re-reads any overlapping bytes. */ |
72 | |
73 | PUSH (%esi) |
74 | |
75 | xor %edx, %edx /* EDX = 0: no match recorded yet. */ |
76 | jmp L(loop) |
77 | |
78 | CFI_POP (%esi) /* Unwind info only: the code below is entered before %esi was pushed. */ |
79 | |
80 | .p2align 4 |
81 | L(unaligned_match1): |
82 | test %ecx, %ecx |
83 | jnz L(prolog_find_zero_1) /* Match and NUL in the same first block. */ |
84 | |
85 | PUSH (%esi) |
86 | |
87 | /* Save current match: EDX = match mask, ESI = end of the block the mask refers to. */ |
88 | mov %eax, %edx |
89 | mov %edi, %esi |
90 | and $-16, %edi /* Continue from the next aligned address. */ |
91 | jmp L(loop) |
92 | |
93 | CFI_POP (%esi) /* Unwind info only: the code below is entered before %esi was pushed. */ |
94 | |
95 | .p2align 4 |
96 | L(crosscache): |
97 | /* Handle a string that starts near the end of a 64-byte block: load the enclosing aligned 16 bytes and discard the bytes that precede the string. */ |
98 | and $15, %ecx /* ECX = offset of the string start within its aligned 16-byte block. */ |
99 | and $-16, %edi |
100 | pxor %xmm3, %xmm3 /* XMM3 is the NUL comparand here, so XMM2 stays zero for the loop. */ |
101 | movdqa (%edi), %xmm0 |
102 | pcmpeqd %xmm0, %xmm3 |
103 | pcmpeqd %xmm1, %xmm0 |
104 | /* Find where NULL is. */ |
105 | pmovmskb %xmm3, %edx |
106 | /* Check if there is a match. */ |
107 | pmovmskb %xmm0, %eax |
108 | /* Remove the leading bytes. */ |
109 | shr %cl, %edx |
110 | shr %cl, %eax |
111 | add $16, %edi /* EDI = next aligned block. */ |
112 | |
113 | test %eax, %eax |
114 | jnz L(unaligned_match) |
115 | |
116 | test %edx, %edx |
117 | jnz L(return_null) /* String ended in the first block without a match. */ |
118 | |
119 | PUSH (%esi) |
120 | |
121 | xor %edx, %edx /* EDX = 0: no match recorded yet. */ |
122 | jmp L(loop) |
123 | |
124 | CFI_POP (%esi) /* Unwind info only: the code below is entered before %esi was pushed. */ |
125 | |
126 | .p2align 4 |
127 | L(unaligned_match): |
128 | test %edx, %edx |
129 | jnz L(prolog_find_zero) /* Match and NUL in the same first block. */ |
130 | |
131 | PUSH (%esi) |
132 | |
133 | mov %eax, %edx /* Record the shifted match mask. */ |
134 | lea (%edi, %ecx), %esi /* ESI = block end plus the shift count, so ESI - 16 + bit index addresses the match. */ |
135 | |
136 | /* Main loop over aligned 16-byte blocks, unrolled four times. On entry to each step XMM2 is zero, EDX holds the last recorded match mask (0 if none) and ESI the matching block end. */ |
137 | .p2align 4 |
138 | L(loop): |
139 | movdqa (%edi), %xmm0 |
140 | pcmpeqd %xmm0, %xmm2 /* XMM2 = NUL mask; it remains zero when the block has no NUL. */ |
141 | add $16, %edi |
142 | pcmpeqd %xmm1, %xmm0 |
143 | pmovmskb %xmm2, %ecx |
144 | pmovmskb %xmm0, %eax |
145 | or %eax, %ecx /* Nonzero when the block holds a match or a NUL. */ |
146 | jnz L(matches) |
147 | |
148 | movdqa (%edi), %xmm3 |
149 | pcmpeqd %xmm3, %xmm2 |
150 | add $16, %edi |
151 | pcmpeqd %xmm1, %xmm3 |
152 | pmovmskb %xmm2, %ecx |
153 | pmovmskb %xmm3, %eax |
154 | or %eax, %ecx |
155 | jnz L(matches) |
156 | |
157 | movdqa (%edi), %xmm4 |
158 | pcmpeqd %xmm4, %xmm2 |
159 | add $16, %edi |
160 | pcmpeqd %xmm1, %xmm4 |
161 | pmovmskb %xmm2, %ecx |
162 | pmovmskb %xmm4, %eax |
163 | or %eax, %ecx |
164 | jnz L(matches) |
165 | |
166 | movdqa (%edi), %xmm5 |
167 | pcmpeqd %xmm5, %xmm2 |
168 | add $16, %edi |
169 | pcmpeqd %xmm1, %xmm5 |
170 | pmovmskb %xmm2, %ecx |
171 | pmovmskb %xmm5, %eax |
172 | or %eax, %ecx |
173 | jz L(loop) |
174 | /* Here EAX = match mask of the block that ends at EDI. */ |
175 | .p2align 4 |
176 | L(matches): |
177 | test %eax, %eax |
178 | jnz L(match) |
179 | L(return_value): |
180 | test %edx, %edx /* The last block gave no usable match: fall back to the recorded one. */ |
181 | jz L(return_null_1) |
182 | mov %edx, %eax |
183 | mov %esi, %edi |
184 | |
185 | POP (%esi) |
186 | /* Pick the highest set group of the mask: bits 8-15 are the third and fourth wide characters, bits 4-7 the second. */ |
187 | test %ah, %ah |
188 | jnz L(match_third_or_fourth_wchar) |
189 | test $15 << 4, %al |
190 | jnz L(match_second_wchar) |
191 | lea -16(%edi), %eax /* Match is the first wide character of the block. */ |
192 | RETURN |
193 | |
194 | CFI_PUSH (%esi) /* Unwind info only: the next label is entered with %esi still pushed. */ |
195 | |
196 | .p2align 4 |
197 | L(return_null_1): |
198 | POP (%esi) |
199 | |
200 | xor %eax, %eax |
201 | RETURN |
202 | |
203 | CFI_PUSH (%esi) /* Unwind info only: the next label is entered with %esi still pushed. */ |
204 | |
205 | .p2align 4 |
206 | L(match): |
207 | pmovmskb %xmm2, %ecx /* Reload the pure NUL mask; ECX was merged with EAX in the loop. */ |
208 | test %ecx, %ecx |
209 | jnz L(find_zero) |
210 | /* save match info */ |
211 | mov %eax, %edx |
212 | mov %edi, %esi |
213 | jmp L(loop) |
214 | /* The block holds both a match and the NUL: only matches at or before the NUL count. */ |
215 | .p2align 4 |
216 | L(find_zero): |
217 | test %cl, %cl |
218 | jz L(find_zero_in_third_or_fourth_wchar) |
219 | test $15, %cl |
220 | jz L(find_zero_in_second_wchar) |
221 | and $1, %eax /* NUL is the first wide character: only a match at that same position counts. */ |
222 | jz L(return_value) |
223 | |
224 | POP (%esi) |
225 | |
226 | lea -16(%edi), %eax |
227 | RETURN |
228 | |
229 | CFI_PUSH (%esi) /* Unwind info only: the next label is entered with %esi still pushed. */ |
230 | |
231 | .p2align 4 |
232 | L(find_zero_in_second_wchar): |
233 | and $1 << 5 - 1, %eax /* Mask is (1 << 5) - 1 under gas precedence: keep bits 0-4, the first wide character plus the lowest bit of the second. */ |
234 | jz L(return_value) |
235 | |
236 | POP (%esi) |
237 | |
238 | test $15 << 4, %al |
239 | jnz L(match_second_wchar) |
240 | lea -16(%edi), %eax |
241 | RETURN |
242 | |
243 | CFI_PUSH (%esi) /* Unwind info only: the next label is entered with %esi still pushed. */ |
244 | |
245 | .p2align 4 |
246 | L(find_zero_in_third_or_fourth_wchar): |
247 | test $15, %ch |
248 | jz L(find_zero_in_fourth_wchar) |
249 | and $1 << 9 - 1, %eax /* Mask is (1 << 9) - 1: keep bits 0-8, the first two wide characters plus the lowest bit of the third. */ |
250 | jz L(return_value) |
251 | |
252 | POP (%esi) |
253 | |
254 | test %ah, %ah |
255 | jnz L(match_third_wchar) |
256 | test $15 << 4, %al |
257 | jnz L(match_second_wchar) |
258 | lea -16(%edi), %eax |
259 | RETURN |
260 | |
261 | CFI_PUSH (%esi) /* Unwind info only: the next label is entered with %esi still pushed. */ |
262 | |
263 | .p2align 4 |
264 | L(find_zero_in_fourth_wchar): |
265 | /* NUL is the last wide character of the block, so every match bit in EAX is valid. */ |
266 | POP (%esi) |
267 | |
268 | test %ah, %ah |
269 | jnz L(match_third_or_fourth_wchar) |
270 | test $15 << 4, %al |
271 | jnz L(match_second_wchar) |
272 | lea -16(%edi), %eax |
273 | RETURN |
274 | |
275 | /* No CFI_PUSH (%esi) here: every label from this point to END is entered with only %edi on the stack, so the unwind state left by RETURN is already correct. */ |
276 | |
277 | .p2align 4 |
278 | L(match_second_wchar): |
279 | lea -12(%edi), %eax /* EDI is the block end; the second wide character is 12 bytes below it. */ |
280 | RETURN |
281 | |
282 | .p2align 4 |
283 | L(match_third_or_fourth_wchar): |
284 | test $15 << 4, %ah |
285 | jnz L(match_fourth_wchar) |
286 | lea -8(%edi), %eax |
287 | RETURN |
288 | |
289 | .p2align 4 |
290 | L(match_third_wchar): |
291 | lea -8(%edi), %eax |
292 | RETURN |
293 | |
294 | .p2align 4 |
295 | L(match_fourth_wchar): |
296 | lea -4(%edi), %eax |
297 | RETURN |
298 | |
299 | .p2align 4 |
300 | L(return_null): |
301 | xor %eax, %eax |
302 | RETURN |
303 | /* First-block variants of the find_zero paths: no match has been recorded and %esi is not pushed, so a miss returns 0 directly. */ |
304 | .p2align 4 |
305 | L(prolog_find_zero): |
306 | add %ecx, %edi /* Compensate for the masks having been shifted right by CL. */ |
307 | mov %edx, %ecx /* ECX = NUL mask, as the shared code below expects. */ |
308 | L(prolog_find_zero_1): |
309 | test %cl, %cl |
310 | jz L(prolog_find_zero_in_third_or_fourth_wchar) |
311 | test $15, %cl |
312 | jz L(prolog_find_zero_in_second_wchar) |
313 | and $1, %eax |
314 | jz L(return_null) |
315 | |
316 | lea -16(%edi), %eax |
317 | RETURN |
318 | |
319 | .p2align 4 |
320 | L(prolog_find_zero_in_second_wchar): |
321 | and $1 << 5 - 1, %eax /* Keep match bits 0-4, as in find_zero_in_second_wchar. */ |
322 | jz L(return_null) |
323 | |
324 | test $15 << 4, %al |
325 | jnz L(match_second_wchar) |
326 | lea -16(%edi), %eax |
327 | RETURN |
328 | |
329 | .p2align 4 |
330 | L(prolog_find_zero_in_third_or_fourth_wchar): |
331 | test $15, %ch |
332 | jz L(prolog_find_zero_in_fourth_wchar) |
333 | and $1 << 9 - 1, %eax /* Keep match bits 0-8, as in find_zero_in_third_or_fourth_wchar. */ |
334 | jz L(return_null) |
335 | |
336 | test %ah, %ah |
337 | jnz L(match_third_wchar) |
338 | test $15 << 4, %al |
339 | jnz L(match_second_wchar) |
340 | lea -16(%edi), %eax |
341 | RETURN |
342 | |
343 | .p2align 4 |
344 | L(prolog_find_zero_in_fourth_wchar): |
345 | test %ah, %ah |
346 | jnz L(match_third_or_fourth_wchar) |
347 | test $15 << 4, %al |
348 | jnz L(match_second_wchar) |
349 | lea -16(%edi), %eax |
350 | RETURN |
351 | |
352 | END (__wcsrchr_sse2) |
353 | #endif |
354 | |