1/* wcschr with SSE2, without using bsf instructions
2 Copyright (C) 2011-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#if IS_IN (libc)
20# include <sysdep.h>
21
/* Unwind annotations matching a 4-byte push of REG: the CFA offset
   grows by 4 and REG is recorded as saved at offset 0 from the new
   stack pointer.  */
# define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Inverse of CFI_PUSH: the CFA offset shrinks by 4 and REG is marked
   as holding its original value again.  */
# define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push REG and emit the matching unwind annotations.  */
# define PUSH(REG)			\
	pushl REG;			\
	CFI_PUSH (REG)

/* Pop REG and emit the matching unwind annotations.  */
# define POP(REG)			\
	popl REG;			\
	CFI_POP (REG)
32
/* Displacements from %esp, valid at function entry (before any PUSH),
   of the two stack arguments.  PARMS is the displacement of the first
   argument; presumably the 4 bytes below it hold the return address.
   STR1 is the string pointer, STR2 the wide character to search for.  */
# define PARMS	4
# define STR1	PARMS
# define STR2	PARMS+4
36
/* __wcschr_sse2: wcschr using SSE2 compares, without bsf.

   In:   STR1(%esp) = pointer to the wide string
	 STR2(%esp) = 32-bit wide character to look for
   Out:  %eax = address of the first element equal to the character,
	 or 0 if a zero element (the terminator) comes first.  When the
	 character searched for is itself zero, the address of the
	 terminator is returned.
   Uses: %eax, %ecx, %edx, %xmm0-%xmm2 and the flags; %edi is saved
	 and restored on the one path that needs it.

   Register roles while scanning:
     %ecx   address of the 16-byte block (four elements) under test
     %xmm1  the search character replicated into all four dword lanes
     %xmm2  all-zero before each block; after pcmpeqd, the lanes that
	    are zero in the block
     %eax   pmovmskb mask of lanes equal to the search character
     %edx   pmovmskb mask of lanes equal to zero
   Each dword lane contributes four identical mask bits, so lane k
   occupies bits 4k..4k+3 of a mask.  */

	atom_text_section
ENTRY (__wcschr_sse2)

	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1
	pxor	%xmm2, %xmm2

	/* Two self-unpacks replicate lane 0 into all four lanes.  */
	punpckldq %xmm1, %xmm1
	punpckldq %xmm1, %xmm1

	/* If the pointer sits more than 48 bytes into its 64-byte
	   aligned region, a 16-byte load from it would run past the
	   end of that region; take the aligned-load path instead.  */
	mov	%ecx, %eax
	and	$63, %eax
	cmp	$48, %eax
	ja	L(cross_cache)

	/* First block: unaligned load directly from the string start.  */
	movdqu	(%ecx), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	pmovmskb %xmm2, %edx
	or	%eax, %edx
	jnz	L(matches)

	/* Nothing found: round down so the loop continues with the next
	   aligned block.  %xmm2 is still all-zero here.  */
	and	$-16, %ecx
	jmp	L(loop)

	.p2align 4
L(cross_cache):
	/* %edi = enclosing aligned block, %ecx = byte offset of the
	   string start within that block.  */
	PUSH	(%edi)
	mov	%ecx, %edi
	mov	%eax, %ecx
	and	$-16, %edi
	and	$15, %ecx
	movdqa	(%edi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax

	/* Shift out the mask bits of the bytes that precede the string
	   start, so that bit 0 corresponds to the first element.  The
	   explicit test is needed because a shift count of zero leaves
	   the flags untouched.  */
	sarl	%cl, %edx
	sarl	%cl, %eax
	test	%eax, %eax
	jz	L(unaligned_no_match)

	/* A match exists: rebuild the original pointer in %ecx and
	   decode the masks relative to it.  */
	add	%edi, %ecx
	POP	(%edi)

	test	%edx, %edx
	jz	L(match_case1)
	jmp	L(match_case2)

	/* The code below is entered by a jump taken while %edi is still
	   on the stack; re-establish the unwind state for that.  */
	CFI_PUSH (%edi)

	.p2align 4
L(unaligned_no_match):
	mov	%edi, %ecx
	POP	(%edi)

	/* No match in the first partial block; a terminator there means
	   the character does not occur.  */
	test	%edx, %edx
	jnz	L(return_null)

	/* %xmm2 may have lanes set for zero elements located before the
	   string start; clear it before entering the loop.  */
	pxor	%xmm2, %xmm2

/* Main loop over aligned 16-byte blocks, unrolled four times.  A step
   is only reached when the previous one found no zero lane, so %xmm2
   is all-zero at the start of every step.  */
	.p2align 4
L(loop):
	add	$16, %ecx
	movdqa	(%ecx), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	pmovmskb %xmm2, %edx
	or	%eax, %edx
	jnz	L(matches)

	add	$16, %ecx
	movdqa	(%ecx), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	pmovmskb %xmm2, %edx
	or	%eax, %edx
	jnz	L(matches)

	add	$16, %ecx
	movdqa	(%ecx), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	pmovmskb %xmm2, %edx
	or	%eax, %edx
	jnz	L(matches)

	add	$16, %ecx
	movdqa	(%ecx), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	pmovmskb %xmm2, %edx
	or	%eax, %edx
	jz	L(loop)

	/* Something was found in the block at %ecx.  The `or' above
	   destroyed the zero-lane mask, so fetch it again.  */
	.p2align 4
L(matches):
	pmovmskb %xmm2, %edx
	test	%eax, %eax
	jz	L(return_null)		/* Terminator only.  */
	test	%edx, %edx
	jz	L(match_case1)		/* Match only.  */

	/* Both a match and a terminator are in the block: walk the lanes
	   in order and stop at whichever comes first.  Within a lane the
	   match is tested before the terminator, which gives the right
	   answer when the search character is zero.  */
	.p2align 4
L(match_case2):
	test	%al, %al
	jz	L(match_high_case2)	/* No match in lanes 0-1.  */
	test	$15, %al
	jnz	L(ret_lane0)
	test	$15, %dl
	jnz	L(return_null)		/* Terminator in lane 0.  */
	lea	4(%ecx), %eax		/* Match is in lane 1.  */
	ret

	.p2align 4
L(match_high_case2):
	test	%dl, %dl
	jnz	L(return_null)		/* Terminator in lanes 0-1.  */
	test	$15, %ah
	jnz	L(ret_lane2)
	test	$15, %dh
	jnz	L(return_null)		/* Terminator in lane 2.  */
	lea	12(%ecx), %eax		/* Match is in lane 3.  */
	ret

	/* Match without a terminator in the block: return the lowest
	   matching lane.  One bit per lane is enough to test because
	   all four mask bits of a lane are equal.  */
	.p2align 4
L(match_case1):
	test	%al, %al
	jz	L(match_high_case1)
	test	$0x01, %al
	jnz	L(ret_lane0)
	lea	4(%ecx), %eax
	ret

	.p2align 4
L(match_high_case1):
	test	$0x01, %ah
	jnz	L(ret_lane2)
	lea	12(%ecx), %eax
	ret

	.p2align 4
L(ret_lane0):
	mov	%ecx, %eax
	ret

	.p2align 4
L(ret_lane2):
	lea	8(%ecx), %eax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret

END (__wcschr_sse2)
218#endif
219

/* Source: glibc/sysdeps/i386/i686/multiarch/wcschr-sse2.S  */