/* wcsrchr with SSE2, without using bsf instructions.
   Copyright (C) 2011-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20# include <sysdep.h>
/* Unwind bookkeeping for a 4-byte push of REG: the CFA is now 4 bytes
   further from %esp and REG is recorded as saved at the new top of stack.
   cfi_adjust_cfa_offset / cfi_rel_offset / cfi_restore come from
   <sysdep.h>; presumably they wrap the gas .cfi_* directives of the same
   name -- confirm there.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Unwind bookkeeping for the matching 4-byte pop of REG.  */
# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* pushl/popl together with the unwind annotation describing them.  */
# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* Stack layout once ENTRANCE has run:
     0(%esp)	saved %edi
     4(%esp)	return address
     8(%esp)	first argument  (STR1)
    12(%esp)	second argument (STR2)
   so PARMS is the 4-byte return address plus the 4-byte saved %edi.  */
# define PARMS	8
# define ENTRANCE	PUSH (%edi);
/* Restore %edi and return.  The CFI_PUSH after the ret emits no
   instruction; it puts the unwind state back to "%edi saved" for the
   code that textually follows, which is reached by jumps with %edi
   still on the stack.  */
# define RETURN	POP (%edi); ret; CFI_PUSH (%edi);
# define STR1	PARMS
# define STR2	STR1+4
37
38 atom_text_section
39ENTRY (__wcsrchr_sse2)
40
41 ENTRANCE
42 mov STR1(%esp), %ecx
43 movd STR2(%esp), %xmm1
44
45 mov %ecx, %edi
46 punpckldq %xmm1, %xmm1
47 pxor %xmm2, %xmm2
48 punpckldq %xmm1, %xmm1
49
50/* ECX has OFFSET. */
51 and $63, %ecx
52 cmp $48, %ecx
53 ja L(crosscache)
54
55/* unaligned string. */
56 movdqu (%edi), %xmm0
57 pcmpeqd %xmm0, %xmm2
58 pcmpeqd %xmm1, %xmm0
59/* Find where NULL is. */
60 pmovmskb %xmm2, %ecx
61/* Check if there is a match. */
62 pmovmskb %xmm0, %eax
63 add $16, %edi
64
65 test %eax, %eax
66 jnz L(unaligned_match1)
67
68 test %ecx, %ecx
69 jnz L(return_null)
70
71 and $-16, %edi
72
73 PUSH (%esi)
74
75 xor %edx, %edx
76 jmp L(loop)
77
78 CFI_POP (%esi)
79
80 .p2align 4
81L(unaligned_match1):
82 test %ecx, %ecx
83 jnz L(prolog_find_zero_1)
84
85 PUSH (%esi)
86
87/* Save current match */
88 mov %eax, %edx
89 mov %edi, %esi
90 and $-16, %edi
91 jmp L(loop)
92
93 CFI_POP (%esi)
94
95 .p2align 4
96L(crosscache):
97/* Hancle unaligned string. */
98 and $15, %ecx
99 and $-16, %edi
100 pxor %xmm3, %xmm3
101 movdqa (%edi), %xmm0
102 pcmpeqd %xmm0, %xmm3
103 pcmpeqd %xmm1, %xmm0
104/* Find where NULL is. */
105 pmovmskb %xmm3, %edx
106/* Check if there is a match. */
107 pmovmskb %xmm0, %eax
108/* Remove the leading bytes. */
109 shr %cl, %edx
110 shr %cl, %eax
111 add $16, %edi
112
113 test %eax, %eax
114 jnz L(unaligned_match)
115
116 test %edx, %edx
117 jnz L(return_null)
118
119 PUSH (%esi)
120
121 xor %edx, %edx
122 jmp L(loop)
123
124 CFI_POP (%esi)
125
126 .p2align 4
127L(unaligned_match):
128 test %edx, %edx
129 jnz L(prolog_find_zero)
130
131 PUSH (%esi)
132
133 mov %eax, %edx
134 lea (%edi, %ecx), %esi
135
136/* Loop start on aligned string. */
137 .p2align 4
138L(loop):
139 movdqa (%edi), %xmm0
140 pcmpeqd %xmm0, %xmm2
141 add $16, %edi
142 pcmpeqd %xmm1, %xmm0
143 pmovmskb %xmm2, %ecx
144 pmovmskb %xmm0, %eax
145 or %eax, %ecx
146 jnz L(matches)
147
148 movdqa (%edi), %xmm3
149 pcmpeqd %xmm3, %xmm2
150 add $16, %edi
151 pcmpeqd %xmm1, %xmm3
152 pmovmskb %xmm2, %ecx
153 pmovmskb %xmm3, %eax
154 or %eax, %ecx
155 jnz L(matches)
156
157 movdqa (%edi), %xmm4
158 pcmpeqd %xmm4, %xmm2
159 add $16, %edi
160 pcmpeqd %xmm1, %xmm4
161 pmovmskb %xmm2, %ecx
162 pmovmskb %xmm4, %eax
163 or %eax, %ecx
164 jnz L(matches)
165
166 movdqa (%edi), %xmm5
167 pcmpeqd %xmm5, %xmm2
168 add $16, %edi
169 pcmpeqd %xmm1, %xmm5
170 pmovmskb %xmm2, %ecx
171 pmovmskb %xmm5, %eax
172 or %eax, %ecx
173 jz L(loop)
174
175 .p2align 4
176L(matches):
177 test %eax, %eax
178 jnz L(match)
179L(return_value):
180 test %edx, %edx
181 jz L(return_null_1)
182 mov %edx, %eax
183 mov %esi, %edi
184
185 POP (%esi)
186
187 test %ah, %ah
188 jnz L(match_third_or_fourth_wchar)
189 test $15 << 4, %al
190 jnz L(match_second_wchar)
191 lea -16(%edi), %eax
192 RETURN
193
194 CFI_PUSH (%esi)
195
196 .p2align 4
197L(return_null_1):
198 POP (%esi)
199
200 xor %eax, %eax
201 RETURN
202
203 CFI_PUSH (%esi)
204
205 .p2align 4
206L(match):
207 pmovmskb %xmm2, %ecx
208 test %ecx, %ecx
209 jnz L(find_zero)
210/* save match info */
211 mov %eax, %edx
212 mov %edi, %esi
213 jmp L(loop)
214
215 .p2align 4
216L(find_zero):
217 test %cl, %cl
218 jz L(find_zero_in_third_or_fourth_wchar)
219 test $15, %cl
220 jz L(find_zero_in_second_wchar)
221 and $1, %eax
222 jz L(return_value)
223
224 POP (%esi)
225
226 lea -16(%edi), %eax
227 RETURN
228
229 CFI_PUSH (%esi)
230
231 .p2align 4
232L(find_zero_in_second_wchar):
233 and $1 << 5 - 1, %eax
234 jz L(return_value)
235
236 POP (%esi)
237
238 test $15 << 4, %al
239 jnz L(match_second_wchar)
240 lea -16(%edi), %eax
241 RETURN
242
243 CFI_PUSH (%esi)
244
245 .p2align 4
246L(find_zero_in_third_or_fourth_wchar):
247 test $15, %ch
248 jz L(find_zero_in_fourth_wchar)
249 and $1 << 9 - 1, %eax
250 jz L(return_value)
251
252 POP (%esi)
253
254 test %ah, %ah
255 jnz L(match_third_wchar)
256 test $15 << 4, %al
257 jnz L(match_second_wchar)
258 lea -16(%edi), %eax
259 RETURN
260
261 CFI_PUSH (%esi)
262
263 .p2align 4
264L(find_zero_in_fourth_wchar):
265
266 POP (%esi)
267
268 test %ah, %ah
269 jnz L(match_third_or_fourth_wchar)
270 test $15 << 4, %al
271 jnz L(match_second_wchar)
272 lea -16(%edi), %eax
273 RETURN
274
275 CFI_PUSH (%esi)
276
277 .p2align 4
278L(match_second_wchar):
279 lea -12(%edi), %eax
280 RETURN
281
282 .p2align 4
283L(match_third_or_fourth_wchar):
284 test $15 << 4, %ah
285 jnz L(match_fourth_wchar)
286 lea -8(%edi), %eax
287 RETURN
288
289 .p2align 4
290L(match_third_wchar):
291 lea -8(%edi), %eax
292 RETURN
293
294 .p2align 4
295L(match_fourth_wchar):
296 lea -4(%edi), %eax
297 RETURN
298
299 .p2align 4
300L(return_null):
301 xor %eax, %eax
302 RETURN
303
304 .p2align 4
305L(prolog_find_zero):
306 add %ecx, %edi
307 mov %edx, %ecx
308L(prolog_find_zero_1):
309 test %cl, %cl
310 jz L(prolog_find_zero_in_third_or_fourth_wchar)
311 test $15, %cl
312 jz L(prolog_find_zero_in_second_wchar)
313 and $1, %eax
314 jz L(return_null)
315
316 lea -16(%edi), %eax
317 RETURN
318
319 .p2align 4
320L(prolog_find_zero_in_second_wchar):
321 and $1 << 5 - 1, %eax
322 jz L(return_null)
323
324 test $15 << 4, %al
325 jnz L(match_second_wchar)
326 lea -16(%edi), %eax
327 RETURN
328
329 .p2align 4
330L(prolog_find_zero_in_third_or_fourth_wchar):
331 test $15, %ch
332 jz L(prolog_find_zero_in_fourth_wchar)
333 and $1 << 9 - 1, %eax
334 jz L(return_null)
335
336 test %ah, %ah
337 jnz L(match_third_wchar)
338 test $15 << 4, %al
339 jnz L(match_second_wchar)
340 lea -16(%edi), %eax
341 RETURN
342
343 .p2align 4
344L(prolog_find_zero_in_fourth_wchar):
345 test %ah, %ah
346 jnz L(match_third_or_fourth_wchar)
347 test $15 << 4, %al
348 jnz L(match_second_wchar)
349 lea -16(%edi), %eax
350 RETURN
351
352END (__wcsrchr_sse2)
353#endif
/* Source: glibc/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S  */