1 | /* Find character CH in a NUL terminated string. |
2 | Highly optimized version for ix85, x>=5. |
3 | Copyright (C) 1995-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include <sysdep.h> |
21 | #include "asm-syntax.h" |
22 | |
23 | /* This version is especially optimized for the i586 (and following?) |
24 | processors. This is mainly done by using the two pipelines. The |
25 | version optimized for i486 is weak in this aspect because to get |
26 | as much parallelism we have to execute some *more* instructions. |
27 | |
28 | The code below is structured to reflect the pairing of the instructions |
29 | as *I think* it is. I have no processor data book to verify this. |
30 | If you find something you think is incorrect let me know. */ |
31 | |
32 | |
33 | /* The magic value which is used throughout in the whole code. */ |
34 | #define magic 0xfefefeff |
35 | |
36 | #define PARMS 4+16 /* space for 4 saved regs */ |
37 | #define RTN PARMS |
38 | #define STR RTN |
39 | #define CHR STR+4 |
40 | |
41 | .text |
42 | ENTRY (strchr) |
43 | |
44 | pushl %edi /* Save callee-safe registers. */ |
45 | cfi_adjust_cfa_offset (-4) |
46 | pushl %esi |
47 | cfi_adjust_cfa_offset (-4) |
48 | |
49 | pushl %ebx |
50 | cfi_adjust_cfa_offset (-4) |
51 | pushl %ebp |
52 | cfi_adjust_cfa_offset (-4) |
53 | |
54 | movl STR(%esp), %eax |
55 | movl CHR(%esp), %edx |
56 | |
57 | movl %eax, %edi /* duplicate string pointer for later */ |
58 | cfi_rel_offset (edi, 12) |
59 | xorl %ecx, %ecx /* clear %ecx */ |
60 | |
61 | /* At the moment %edx contains C. What we need for the |
62 | algorithm is C in all bytes of the dword. Avoid |
63 | operations on 16 bit words because these require an |
64 | prefix byte (and one more cycle). */ |
65 | movb %dl, %dh /* now it is 0|0|c|c */ |
66 | movb %dl, %cl /* we construct the lower half in %ecx */ |
67 | |
68 | shll $16, %edx /* now %edx is c|c|0|0 */ |
69 | movb %cl, %ch /* now %ecx is 0|0|c|c */ |
70 | |
71 | orl %ecx, %edx /* and finally c|c|c|c */ |
72 | andl $3, %edi /* mask alignment bits */ |
73 | |
74 | jz L(11) /* alignment is 0 => start loop */ |
75 | |
76 | movb %dl, %cl /* 0 is needed below */ |
77 | jp L(0) /* exactly two bits set */ |
78 | |
79 | xorb (%eax), %cl /* is byte the one we are looking for? */ |
80 | jz L(out) /* yes => return pointer */ |
81 | |
82 | xorb %dl, %cl /* load single byte and test for NUL */ |
83 | je L(3) /* yes => return NULL */ |
84 | |
85 | movb 1(%eax), %cl /* load single byte */ |
86 | incl %eax |
87 | |
88 | cmpb %cl, %dl /* is byte == C? */ |
89 | je L(out) /* aligned => return pointer */ |
90 | |
91 | cmpb $0, %cl /* is byte NUL? */ |
92 | je L(3) /* yes => return NULL */ |
93 | |
94 | incl %eax |
95 | decl %edi |
96 | |
97 | jne L(11) |
98 | |
99 | L(0): movb (%eax), %cl /* load single byte */ |
100 | |
101 | cmpb %cl, %dl /* is byte == C? */ |
102 | je L(out) /* aligned => return pointer */ |
103 | |
104 | cmpb $0, %cl /* is byte NUL? */ |
105 | je L(3) /* yes => return NULL */ |
106 | |
107 | incl %eax /* increment pointer */ |
108 | |
109 | cfi_rel_offset (esi, 8) |
110 | cfi_rel_offset (ebx, 4) |
111 | cfi_rel_offset (ebp, 0) |
112 | |
113 | /* The following code is the preparation for the loop. The |
114 | four instruction up to `L1' will not be executed in the loop |
115 | because the same code is found at the end of the loop, but |
116 | there it is executed in parallel with other instructions. */ |
117 | L(11): movl (%eax), %ecx |
118 | movl $magic, %ebp |
119 | |
120 | movl $magic, %edi |
121 | addl %ecx, %ebp |
122 | |
123 | /* The main loop: it looks complex and indeed it is. I would |
124 | love to say `it was hard to write, so it should he hard to |
125 | read' but I will give some more hints. To fully understand |
126 | this code you should first take a look at the i486 version. |
127 | The basic algorithm is the same, but here the code organized |
128 | in a way which permits to use both pipelines all the time. |
129 | |
130 | I tried to make it a bit more understandable by indenting |
131 | the code according to stage in the algorithm. It goes as |
132 | follows: |
133 | check for 0 in 1st word |
134 | check for C in 1st word |
135 | check for 0 in 2nd word |
136 | check for C in 2nd word |
137 | check for 0 in 3rd word |
138 | check for C in 3rd word |
139 | check for 0 in 4th word |
140 | check for C in 4th word |
141 | |
142 | Please note that doing the test for NUL before the test for |
143 | C allows us to overlap the test for 0 in the next word with |
144 | the test for C. */ |
145 | |
146 | L(1): xorl %ecx, %ebp /* (word^magic) */ |
147 | addl %ecx, %edi /* add magic word */ |
148 | |
149 | leal 4(%eax), %eax /* increment pointer */ |
150 | jnc L(4) /* previous addl caused overflow? */ |
151 | |
152 | movl %ecx, %ebx /* duplicate original word */ |
153 | orl $magic, %ebp /* (word^magic)|magic */ |
154 | |
155 | addl $1, %ebp /* (word^magic)|magic == 0xffffffff? */ |
156 | jne L(4) /* yes => we found word with NUL */ |
157 | |
158 | movl $magic, %esi /* load magic value */ |
159 | xorl %edx, %ebx /* clear words which are C */ |
160 | |
161 | movl (%eax), %ecx |
162 | addl %ebx, %esi /* (word+magic) */ |
163 | |
164 | movl $magic, %edi |
165 | jnc L(5) /* previous addl caused overflow? */ |
166 | |
167 | movl %edi, %ebp |
168 | xorl %ebx, %esi /* (word+magic)^word */ |
169 | |
170 | addl %ecx, %ebp |
171 | orl $magic, %esi /* ((word+magic)^word)|magic */ |
172 | |
173 | addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/ |
174 | jne L(5) /* yes => we found word with C */ |
175 | |
176 | xorl %ecx, %ebp |
177 | addl %ecx, %edi |
178 | |
179 | leal 4(%eax), %eax |
180 | jnc L(4) |
181 | |
182 | movl %ecx, %ebx |
183 | orl $magic, %ebp |
184 | |
185 | addl $1, %ebp |
186 | jne L(4) |
187 | |
188 | movl $magic, %esi |
189 | xorl %edx, %ebx |
190 | |
191 | movl (%eax), %ecx |
192 | addl %ebx, %esi |
193 | |
194 | movl $magic, %edi |
195 | jnc L(5) |
196 | |
197 | movl %edi, %ebp |
198 | xorl %ebx, %esi |
199 | |
200 | addl %ecx, %ebp |
201 | orl $magic, %esi |
202 | |
203 | addl $1, %esi |
204 | jne L(5) |
205 | |
206 | xorl %ecx, %ebp |
207 | addl %ecx, %edi |
208 | |
209 | leal 4(%eax), %eax |
210 | jnc L(4) |
211 | |
212 | movl %ecx, %ebx |
213 | orl $magic, %ebp |
214 | |
215 | addl $1, %ebp |
216 | jne L(4) |
217 | |
218 | movl $magic, %esi |
219 | xorl %edx, %ebx |
220 | |
221 | movl (%eax), %ecx |
222 | addl %ebx, %esi |
223 | |
224 | movl $magic, %edi |
225 | jnc L(5) |
226 | |
227 | movl %edi, %ebp |
228 | xorl %ebx, %esi |
229 | |
230 | addl %ecx, %ebp |
231 | orl $magic, %esi |
232 | |
233 | addl $1, %esi |
234 | jne L(5) |
235 | |
236 | xorl %ecx, %ebp |
237 | addl %ecx, %edi |
238 | |
239 | leal 4(%eax), %eax |
240 | jnc L(4) |
241 | |
242 | movl %ecx, %ebx |
243 | orl $magic, %ebp |
244 | |
245 | addl $1, %ebp |
246 | jne L(4) |
247 | |
248 | movl $magic, %esi |
249 | xorl %edx, %ebx |
250 | |
251 | movl (%eax), %ecx |
252 | addl %ebx, %esi |
253 | |
254 | movl $magic, %edi |
255 | jnc L(5) |
256 | |
257 | movl %edi, %ebp |
258 | xorl %ebx, %esi |
259 | |
260 | addl %ecx, %ebp |
261 | orl $magic, %esi |
262 | |
263 | addl $1, %esi |
264 | |
265 | je L(1) |
266 | |
267 | /* We know there is no NUL byte but a C byte in the word. |
268 | %ebx contains NUL in this particular byte. */ |
269 | L(5): subl $4, %eax /* adjust pointer */ |
270 | testb %bl, %bl /* first byte == C? */ |
271 | |
272 | jz L(out) /* yes => return pointer */ |
273 | |
274 | incl %eax /* increment pointer */ |
275 | testb %bh, %bh /* second byte == C? */ |
276 | |
277 | jz L(out) /* yes => return pointer */ |
278 | |
279 | shrl $16, %ebx /* make upper bytes accessible */ |
280 | incl %eax /* increment pointer */ |
281 | |
282 | cmp $0, %bl /* third byte == C */ |
283 | je L(out) /* yes => return pointer */ |
284 | |
285 | incl %eax /* increment pointer */ |
286 | |
287 | L(out): popl %ebp /* restore saved registers */ |
288 | cfi_adjust_cfa_offset (-4) |
289 | cfi_restore (ebp) |
290 | popl %ebx |
291 | cfi_adjust_cfa_offset (-4) |
292 | cfi_restore (ebx) |
293 | |
294 | popl %esi |
295 | cfi_adjust_cfa_offset (-4) |
296 | cfi_restore (esi) |
297 | popl %edi |
298 | cfi_adjust_cfa_offset (-4) |
299 | cfi_restore (edi) |
300 | |
301 | ret |
302 | |
303 | cfi_adjust_cfa_offset (16) |
304 | cfi_rel_offset (edi, 12) |
305 | cfi_rel_offset (esi, 8) |
306 | cfi_rel_offset (ebx, 4) |
307 | cfi_rel_offset (ebp, 0) |
308 | /* We know there is a NUL byte in the word. But we have to test |
309 | whether there is an C byte before it in the word. */ |
310 | L(4): subl $4, %eax /* adjust pointer */ |
311 | cmpb %dl, %cl /* first byte == C? */ |
312 | |
313 | je L(out) /* yes => return pointer */ |
314 | |
315 | cmpb $0, %cl /* first byte == NUL? */ |
316 | je L(3) /* yes => return NULL */ |
317 | |
318 | incl %eax /* increment pointer */ |
319 | |
320 | cmpb %dl, %ch /* second byte == C? */ |
321 | je L(out) /* yes => return pointer */ |
322 | |
323 | cmpb $0, %ch /* second byte == NUL? */ |
324 | je L(3) /* yes => return NULL */ |
325 | |
326 | shrl $16, %ecx /* make upper bytes accessible */ |
327 | incl %eax /* increment pointer */ |
328 | |
329 | cmpb %dl, %cl /* third byte == C? */ |
330 | je L(out) /* yes => return pointer */ |
331 | |
332 | cmpb $0, %cl /* third byte == NUL? */ |
333 | je L(3) /* yes => return NULL */ |
334 | |
335 | incl %eax /* increment pointer */ |
336 | |
337 | /* The test four the fourth byte is necessary! */ |
338 | cmpb %dl, %ch /* fourth byte == C? */ |
339 | je L(out) /* yes => return pointer */ |
340 | |
341 | L(3): xorl %eax, %eax |
342 | jmp L(out) |
343 | END (strchr) |
344 | |
345 | #undef index |
346 | weak_alias (strchr, index) |
347 | libc_hidden_builtin_def (strchr) |
348 | |