strchr.S source code [glibc/sysdeps/i386/i586/strchr.S]

1	/ Find character CH in a NUL terminated string.*
2	Highly optimized version for ix85, x>=5.
3	Copyright (C) 1995-2024 Free Software Foundation, Inc.
4	This file is part of the GNU C Library.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. /*
19
20	#include <sysdep.h>
21	#include "asm-syntax.h"
22
23	/ This version is especially optimized for the i586 (and following?)*
24	processors. This is mainly done by using the two pipelines. The
25	version optimized for i486 is weak in this aspect because to get
26	as much parallelism we have to execute some more* instructions.*
27
28	The code below is structured to reflect the pairing of the instructions
29	as I think* it is. I have no processor data book to verify this.*
30	If you find something you think is incorrect let me know. /*
31
32
33	/ The magic value which is used throughout in the whole code. /
34	#define magic 0xfefefeff
35
36	#define PARMS 4+16 /* space for 4 saved regs */
37	#define RTN PARMS
38	#define STR RTN
39	#define CHR STR+4
40
41	.text
42	ENTRY (strchr)
43
44	pushl %edi / Save callee-safe registers. /
45	cfi_adjust_cfa_offset (-`4`)
46	pushl %esi
47	cfi_adjust_cfa_offset (-`4`)
48
49	pushl %ebx
50	cfi_adjust_cfa_offset (-`4`)
51	pushl %ebp
52	cfi_adjust_cfa_offset (-`4`)
53
54	movl STR(%esp), %eax
55	movl CHR(%esp), %edx
56
57	movl %eax, %edi / duplicate string pointer for later /
58	cfi_rel_offset (edi, `12`)
59	xorl %ecx, %ecx / clear %ecx /
60
61	/ At the moment %edx contains C. What we need for the*
62	algorithm is C in all bytes of the dword. Avoid
63	operations on 16 bit words because these require an
64	prefix byte (and one more cycle). /*
65	movb %dl, %dh / now it is 0\|0\|c\|c /
66	movb %dl, %cl / we construct the lower half in %ecx /
67
68	shll $`16`, %edx / now %edx is c\|c\|0\|0 /
69	movb %cl, %ch / now %ecx is 0\|0\|c\|c /
70
71	orl %ecx, %edx / and finally c\|c\|c\|c /
72	andl $`3`, %edi / mask alignment bits /
73
74	jz L(`11`) / alignment is 0 => start loop /
75
76	movb %dl, %cl / 0 is needed below /
77	jp L(`0`) / exactly two bits set /
78
79	xorb (%eax), %cl / is byte the one we are looking for? /
80	jz L(out) / yes => return pointer /
81
82	xorb %dl, %cl / load single byte and test for NUL /
83	je L(`3`) / yes => return NULL /
84
85	movb `1`(%eax), %cl / load single byte /
86	incl %eax
87
88	cmpb %cl, %dl / is byte == C? /
89	je L(out) / aligned => return pointer /
90
91	cmpb $`0`, %cl / is byte NUL? /
92	je L(`3`) / yes => return NULL /
93
94	incl %eax
95	decl %edi
96
97	jne L(`11`)
98
99	L(`0`): movb (%eax), %cl / load single byte /
100
101	cmpb %cl, %dl / is byte == C? /
102	je L(out) / aligned => return pointer /
103
104	cmpb $`0`, %cl / is byte NUL? /
105	je L(`3`) / yes => return NULL /
106
107	incl %eax / increment pointer /
108
109	cfi_rel_offset (esi, `8`)
110	cfi_rel_offset (ebx, `4`)
111	cfi_rel_offset (ebp, `0`)
112
113	/ The following code is the preparation for the loop. The*
114	four instruction up to `L1' will not be executed in the loop
115	because the same code is found at the end of the loop, but
116	there it is executed in parallel with other instructions. /*
117	L(`11`): movl (%eax), %ecx
118	movl $magic, %ebp
119
120	movl $magic, %edi
121	addl %ecx, %ebp
122
123	/ The main loop: it looks complex and indeed it is. I would*
124	love to say `it was hard to write, so it should he hard to
125	read' but I will give some more hints. To fully understand
126	this code you should first take a look at the i486 version.
127	The basic algorithm is the same, but here the code organized
128	in a way which permits to use both pipelines all the time.
129
130	I tried to make it a bit more understandable by indenting
131	the code according to stage in the algorithm. It goes as
132	follows:
133	check for 0 in 1st word
134	check for C in 1st word
135	check for 0 in 2nd word
136	check for C in 2nd word
137	check for 0 in 3rd word
138	check for C in 3rd word
139	check for 0 in 4th word
140	check for C in 4th word
141
142	Please note that doing the test for NUL before the test for
143	C allows us to overlap the test for 0 in the next word with
144	the test for C. /*
145
146	L(`1`): xorl %ecx, %ebp / (word^magic) /
147	addl %ecx, %edi / add magic word /
148
149	leal `4`(%eax), %eax / increment pointer /
150	jnc L(`4`) / previous addl caused overflow? /
151
152	movl %ecx, %ebx / duplicate original word /
153	orl $magic, %ebp / (word^magic)\|magic /
154
155	addl $`1`, %ebp / (word^magic)\|magic == 0xffffffff? /
156	jne L(`4`) / yes => we found word with NUL /
157
158	movl $magic, %esi / load magic value /
159	xorl %edx, %ebx / clear words which are C /
160
161	movl (%eax), %ecx
162	addl %ebx, %esi / (word+magic) /
163
164	movl $magic, %edi
165	jnc L(`5`) / previous addl caused overflow? /
166
167	movl %edi, %ebp
168	xorl %ebx, %esi / (word+magic)^word /
169
170	addl %ecx, %ebp
171	orl $magic, %esi / ((word+magic)^word)\|magic /
172
173	addl $`1`, %esi / ((word+magic)^word)\|magic==0xf..f?/
174	jne L(`5`) / yes => we found word with C /
175
176	xorl %ecx, %ebp
177	addl %ecx, %edi
178
179	leal `4`(%eax), %eax
180	jnc L(`4`)
181
182	movl %ecx, %ebx
183	orl $magic, %ebp
184
185	addl $`1`, %ebp
186	jne L(`4`)
187
188	movl $magic, %esi
189	xorl %edx, %ebx
190
191	movl (%eax), %ecx
192	addl %ebx, %esi
193
194	movl $magic, %edi
195	jnc L(`5`)
196
197	movl %edi, %ebp
198	xorl %ebx, %esi
199
200	addl %ecx, %ebp
201	orl $magic, %esi
202
203	addl $`1`, %esi
204	jne L(`5`)
205
206	xorl %ecx, %ebp
207	addl %ecx, %edi
208
209	leal `4`(%eax), %eax
210	jnc L(`4`)
211
212	movl %ecx, %ebx
213	orl $magic, %ebp
214
215	addl $`1`, %ebp
216	jne L(`4`)
217
218	movl $magic, %esi
219	xorl %edx, %ebx
220
221	movl (%eax), %ecx
222	addl %ebx, %esi
223
224	movl $magic, %edi
225	jnc L(`5`)
226
227	movl %edi, %ebp
228	xorl %ebx, %esi
229
230	addl %ecx, %ebp
231	orl $magic, %esi
232
233	addl $`1`, %esi
234	jne L(`5`)
235
236	xorl %ecx, %ebp
237	addl %ecx, %edi
238
239	leal `4`(%eax), %eax
240	jnc L(`4`)
241
242	movl %ecx, %ebx
243	orl $magic, %ebp
244
245	addl $`1`, %ebp
246	jne L(`4`)
247
248	movl $magic, %esi
249	xorl %edx, %ebx
250
251	movl (%eax), %ecx
252	addl %ebx, %esi
253
254	movl $magic, %edi
255	jnc L(`5`)
256
257	movl %edi, %ebp
258	xorl %ebx, %esi
259
260	addl %ecx, %ebp
261	orl $magic, %esi
262
263	addl $`1`, %esi
264
265	je L(`1`)
266
267	/ We know there is no NUL byte but a C byte in the word.*
268	%ebx contains NUL in this particular byte. /*
269	L(`5`): subl $`4`, %eax / adjust pointer /
270	testb %bl, %bl / first byte == C? /
271
272	jz L(out) / yes => return pointer /
273
274	incl %eax / increment pointer /
275	testb %bh, %bh / second byte == C? /
276
277	jz L(out) / yes => return pointer /
278
279	shrl $`16`, %ebx / make upper bytes accessible /
280	incl %eax / increment pointer /
281
282	cmp $`0`, %bl / third byte == C /
283	je L(out) / yes => return pointer /
284
285	incl %eax / increment pointer /
286
287	L(out): popl %ebp / restore saved registers /
288	cfi_adjust_cfa_offset (-`4`)
289	cfi_restore (ebp)
290	popl %ebx
291	cfi_adjust_cfa_offset (-`4`)
292	cfi_restore (ebx)
293
294	popl %esi
295	cfi_adjust_cfa_offset (-`4`)
296	cfi_restore (esi)
297	popl %edi
298	cfi_adjust_cfa_offset (-`4`)
299	cfi_restore (edi)
300
301	ret
302
303	cfi_adjust_cfa_offset (`16`)
304	cfi_rel_offset (edi, `12`)
305	cfi_rel_offset (esi, `8`)
306	cfi_rel_offset (ebx, `4`)
307	cfi_rel_offset (ebp, `0`)
308	/ We know there is a NUL byte in the word. But we have to test*
309	whether there is an C byte before it in the word. /*
310	L(`4`): subl $`4`, %eax / adjust pointer /
311	cmpb %dl, %cl / first byte == C? /
312
313	je L(out) / yes => return pointer /
314
315	cmpb $`0`, %cl / first byte == NUL? /
316	je L(`3`) / yes => return NULL /
317
318	incl %eax / increment pointer /
319
320	cmpb %dl, %ch / second byte == C? /
321	je L(out) / yes => return pointer /
322
323	cmpb $`0`, %ch / second byte == NUL? /
324	je L(`3`) / yes => return NULL /
325
326	shrl $`16`, %ecx / make upper bytes accessible /
327	incl %eax / increment pointer /
328
329	cmpb %dl, %cl / third byte == C? /
330	je L(out) / yes => return pointer /
331
332	cmpb $`0`, %cl / third byte == NUL? /
333	je L(`3`) / yes => return NULL /
334
335	incl %eax / increment pointer /
336
337	/ The test four the fourth byte is necessary! /
338	cmpb %dl, %ch / fourth byte == C? /
339	je L(out) / yes => return pointer /
340
341	L(`3`): xorl %eax, %eax
342	jmp L(out)
343	END (strchr)
344
345	#undef index
346	weak_alias (strchr, index)
347	libc_hidden_builtin_def (strchr)
348

source code of glibc/sysdeps/i386/i586/strchr.S