1/* Find character CH in a NUL terminated string.
2 Highly optimized version for ix85, x>=5.
3 Copyright (C) 1995-2024 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#include <sysdep.h>
21#include "asm-syntax.h"
22
23/* This version is especially optimized for the i586 (and following?)
24 processors. This is mainly done by using the two pipelines. The
25 version optimized for i486 is weak in this aspect because to get
26 as much parallelism we have to execute some *more* instructions.
27
28 The code below is structured to reflect the pairing of the instructions
29 as *I think* it is. I have no processor data book to verify this.
30 If you find something you think is incorrect let me know. */
31
32
33/* The magic value which is used throughout in the whole code. */
34#define magic 0xfefefeff
35
36#define PARMS 4+16 /* space for 4 saved regs */
37#define RTN PARMS
38#define STR RTN
39#define CHR STR+4
40
41 .text
42ENTRY (strchr)
43
44 pushl %edi /* Save callee-safe registers. */
45 cfi_adjust_cfa_offset (-4)
46 pushl %esi
47 cfi_adjust_cfa_offset (-4)
48
49 pushl %ebx
50 cfi_adjust_cfa_offset (-4)
51 pushl %ebp
52 cfi_adjust_cfa_offset (-4)
53
54 movl STR(%esp), %eax
55 movl CHR(%esp), %edx
56
57 movl %eax, %edi /* duplicate string pointer for later */
58 cfi_rel_offset (edi, 12)
59 xorl %ecx, %ecx /* clear %ecx */
60
61 /* At the moment %edx contains C. What we need for the
62 algorithm is C in all bytes of the dword. Avoid
63 operations on 16 bit words because these require an
64 prefix byte (and one more cycle). */
65 movb %dl, %dh /* now it is 0|0|c|c */
66 movb %dl, %cl /* we construct the lower half in %ecx */
67
68 shll $16, %edx /* now %edx is c|c|0|0 */
69 movb %cl, %ch /* now %ecx is 0|0|c|c */
70
71 orl %ecx, %edx /* and finally c|c|c|c */
72 andl $3, %edi /* mask alignment bits */
73
74 jz L(11) /* alignment is 0 => start loop */
75
76 movb %dl, %cl /* 0 is needed below */
77 jp L(0) /* exactly two bits set */
78
79 xorb (%eax), %cl /* is byte the one we are looking for? */
80 jz L(out) /* yes => return pointer */
81
82 xorb %dl, %cl /* load single byte and test for NUL */
83 je L(3) /* yes => return NULL */
84
85 movb 1(%eax), %cl /* load single byte */
86 incl %eax
87
88 cmpb %cl, %dl /* is byte == C? */
89 je L(out) /* aligned => return pointer */
90
91 cmpb $0, %cl /* is byte NUL? */
92 je L(3) /* yes => return NULL */
93
94 incl %eax
95 decl %edi
96
97 jne L(11)
98
99L(0): movb (%eax), %cl /* load single byte */
100
101 cmpb %cl, %dl /* is byte == C? */
102 je L(out) /* aligned => return pointer */
103
104 cmpb $0, %cl /* is byte NUL? */
105 je L(3) /* yes => return NULL */
106
107 incl %eax /* increment pointer */
108
109 cfi_rel_offset (esi, 8)
110 cfi_rel_offset (ebx, 4)
111 cfi_rel_offset (ebp, 0)
112
113 /* The following code is the preparation for the loop. The
114 four instruction up to `L1' will not be executed in the loop
115 because the same code is found at the end of the loop, but
116 there it is executed in parallel with other instructions. */
117L(11): movl (%eax), %ecx
118 movl $magic, %ebp
119
120 movl $magic, %edi
121 addl %ecx, %ebp
122
123 /* The main loop: it looks complex and indeed it is. I would
124 love to say `it was hard to write, so it should he hard to
125 read' but I will give some more hints. To fully understand
126 this code you should first take a look at the i486 version.
127 The basic algorithm is the same, but here the code organized
128 in a way which permits to use both pipelines all the time.
129
130 I tried to make it a bit more understandable by indenting
131 the code according to stage in the algorithm. It goes as
132 follows:
133 check for 0 in 1st word
134 check for C in 1st word
135 check for 0 in 2nd word
136 check for C in 2nd word
137 check for 0 in 3rd word
138 check for C in 3rd word
139 check for 0 in 4th word
140 check for C in 4th word
141
142 Please note that doing the test for NUL before the test for
143 C allows us to overlap the test for 0 in the next word with
144 the test for C. */
145
146L(1): xorl %ecx, %ebp /* (word^magic) */
147 addl %ecx, %edi /* add magic word */
148
149 leal 4(%eax), %eax /* increment pointer */
150 jnc L(4) /* previous addl caused overflow? */
151
152 movl %ecx, %ebx /* duplicate original word */
153 orl $magic, %ebp /* (word^magic)|magic */
154
155 addl $1, %ebp /* (word^magic)|magic == 0xffffffff? */
156 jne L(4) /* yes => we found word with NUL */
157
158 movl $magic, %esi /* load magic value */
159 xorl %edx, %ebx /* clear words which are C */
160
161 movl (%eax), %ecx
162 addl %ebx, %esi /* (word+magic) */
163
164 movl $magic, %edi
165 jnc L(5) /* previous addl caused overflow? */
166
167 movl %edi, %ebp
168 xorl %ebx, %esi /* (word+magic)^word */
169
170 addl %ecx, %ebp
171 orl $magic, %esi /* ((word+magic)^word)|magic */
172
173 addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/
174 jne L(5) /* yes => we found word with C */
175
176 xorl %ecx, %ebp
177 addl %ecx, %edi
178
179 leal 4(%eax), %eax
180 jnc L(4)
181
182 movl %ecx, %ebx
183 orl $magic, %ebp
184
185 addl $1, %ebp
186 jne L(4)
187
188 movl $magic, %esi
189 xorl %edx, %ebx
190
191 movl (%eax), %ecx
192 addl %ebx, %esi
193
194 movl $magic, %edi
195 jnc L(5)
196
197 movl %edi, %ebp
198 xorl %ebx, %esi
199
200 addl %ecx, %ebp
201 orl $magic, %esi
202
203 addl $1, %esi
204 jne L(5)
205
206 xorl %ecx, %ebp
207 addl %ecx, %edi
208
209 leal 4(%eax), %eax
210 jnc L(4)
211
212 movl %ecx, %ebx
213 orl $magic, %ebp
214
215 addl $1, %ebp
216 jne L(4)
217
218 movl $magic, %esi
219 xorl %edx, %ebx
220
221 movl (%eax), %ecx
222 addl %ebx, %esi
223
224 movl $magic, %edi
225 jnc L(5)
226
227 movl %edi, %ebp
228 xorl %ebx, %esi
229
230 addl %ecx, %ebp
231 orl $magic, %esi
232
233 addl $1, %esi
234 jne L(5)
235
236 xorl %ecx, %ebp
237 addl %ecx, %edi
238
239 leal 4(%eax), %eax
240 jnc L(4)
241
242 movl %ecx, %ebx
243 orl $magic, %ebp
244
245 addl $1, %ebp
246 jne L(4)
247
248 movl $magic, %esi
249 xorl %edx, %ebx
250
251 movl (%eax), %ecx
252 addl %ebx, %esi
253
254 movl $magic, %edi
255 jnc L(5)
256
257 movl %edi, %ebp
258 xorl %ebx, %esi
259
260 addl %ecx, %ebp
261 orl $magic, %esi
262
263 addl $1, %esi
264
265 je L(1)
266
267 /* We know there is no NUL byte but a C byte in the word.
268 %ebx contains NUL in this particular byte. */
269L(5): subl $4, %eax /* adjust pointer */
270 testb %bl, %bl /* first byte == C? */
271
272 jz L(out) /* yes => return pointer */
273
274 incl %eax /* increment pointer */
275 testb %bh, %bh /* second byte == C? */
276
277 jz L(out) /* yes => return pointer */
278
279 shrl $16, %ebx /* make upper bytes accessible */
280 incl %eax /* increment pointer */
281
282 cmp $0, %bl /* third byte == C */
283 je L(out) /* yes => return pointer */
284
285 incl %eax /* increment pointer */
286
287L(out): popl %ebp /* restore saved registers */
288 cfi_adjust_cfa_offset (-4)
289 cfi_restore (ebp)
290 popl %ebx
291 cfi_adjust_cfa_offset (-4)
292 cfi_restore (ebx)
293
294 popl %esi
295 cfi_adjust_cfa_offset (-4)
296 cfi_restore (esi)
297 popl %edi
298 cfi_adjust_cfa_offset (-4)
299 cfi_restore (edi)
300
301 ret
302
303 cfi_adjust_cfa_offset (16)
304 cfi_rel_offset (edi, 12)
305 cfi_rel_offset (esi, 8)
306 cfi_rel_offset (ebx, 4)
307 cfi_rel_offset (ebp, 0)
308 /* We know there is a NUL byte in the word. But we have to test
309 whether there is an C byte before it in the word. */
310L(4): subl $4, %eax /* adjust pointer */
311 cmpb %dl, %cl /* first byte == C? */
312
313 je L(out) /* yes => return pointer */
314
315 cmpb $0, %cl /* first byte == NUL? */
316 je L(3) /* yes => return NULL */
317
318 incl %eax /* increment pointer */
319
320 cmpb %dl, %ch /* second byte == C? */
321 je L(out) /* yes => return pointer */
322
323 cmpb $0, %ch /* second byte == NUL? */
324 je L(3) /* yes => return NULL */
325
326 shrl $16, %ecx /* make upper bytes accessible */
327 incl %eax /* increment pointer */
328
329 cmpb %dl, %cl /* third byte == C? */
330 je L(out) /* yes => return pointer */
331
332 cmpb $0, %cl /* third byte == NUL? */
333 je L(3) /* yes => return NULL */
334
335 incl %eax /* increment pointer */
336
337 /* The test four the fourth byte is necessary! */
338 cmpb %dl, %ch /* fourth byte == C? */
339 je L(out) /* yes => return pointer */
340
341L(3): xorl %eax, %eax
342 jmp L(out)
343END (strchr)
344
345#undef index
346weak_alias (strchr, index)
347libc_hidden_builtin_def (strchr)
348

source code of glibc/sysdeps/i386/i586/strchr.S