1 | /* strlen -- Compute length of NUL terminated string. |
2 | Highly optimized version for ix86, x>=5. |
3 | Copyright (C) 1995-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include <sysdep.h> |
21 | #include "asm-syntax.h" |
22 | |
23 | /* This version is especially optimized for the i586 (and following?) |
24 | processors. This is mainly done by using the two pipelines. The |
25 | version optimized for i486 is weak in this aspect because to get |
26 | as much parallelism we have to execute some *more* instructions. |
27 | |
28 | The code below is structured to reflect the pairing of the instructions |
29 | as *I think* it is. I have no processor data book to verify this. |
30 | If you find something you think is incorrect let me know. */ |
31 | |
32 | |
33 | /* The magic value which is used throughout in the whole code. */ |
34 | #define magic 0xfefefeff |
35 | |
36 | #define PARMS 4 /* no space for saved regs */ |
37 | #define STR PARMS |
38 | |
39 | .text |
40 | ENTRY (strlen) |
41 | |
42 | movl STR(%esp), %eax |
43 | movl $3, %edx /* load mask (= 3) */ |
44 | |
45 | andl %eax, %edx /* separate last two bits of address */ |
46 | |
47 | jz L(1) /* aligned => start loop */ |
48 | jp L(0) /* exactly two bits set */ |
49 | |
50 | cmpb %dh, (%eax) /* is byte NUL? */ |
51 | je L(2) /* yes => return */ |
52 | |
53 | incl %eax /* increment pointer */ |
54 | cmpb %dh, (%eax) /* is byte NUL? */ |
55 | |
56 | je L(2) /* yes => return */ |
57 | |
58 | incl %eax /* increment pointer */ |
59 | xorl $2, %edx |
60 | |
61 | jz L(1) |
62 | |
63 | L(0): cmpb %dh, (%eax) /* is byte NUL? */ |
64 | je L(2) /* yes => return */ |
65 | |
66 | incl %eax /* increment pointer */ |
67 | xorl %edx, %edx /* We need %edx == 0 for later */ |
68 | |
69 | /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to |
70 | change any of the hole bits of LONGWORD. |
71 | |
72 | 1) Is this safe? Will it catch all the zero bytes? |
73 | Suppose there is a byte with all zeros. Any carry bits |
74 | propagating from its left will fall into the hole at its |
75 | least significant bit and stop. Since there will be no |
76 | carry from its most significant bit, the LSB of the |
77 | byte to the left will be unchanged, and the zero will be |
78 | detected. |
79 | |
80 | 2) Is this worthwhile? Will it ignore everything except |
81 | zero bytes? Suppose every byte of LONGWORD has a bit set |
82 | somewhere. There will be a carry into bit 8. If bit 8 |
83 | is set, this will carry into bit 16. If bit 8 is clear, |
84 | one of bits 9-15 must be set, so there will be a carry |
85 | into bit 16. Similarly, there will be a carry into bit |
86 | 24. If one of bits 24-31 is set, there will be a carry |
87 | into bit 32 (=carry flag), so all of the hole bits will |
88 | be changed. |
89 | |
90 | Note: %edx == 0 in any case here. */ |
91 | |
92 | L(1): |
93 | movl (%eax), %ecx /* get word (= 4 bytes) in question */ |
94 | addl $4, %eax /* adjust pointer for *next* word */ |
95 | |
96 | subl %ecx, %edx /* first step to negate word */ |
97 | addl $magic, %ecx /* add magic word */ |
98 | |
99 | decl %edx /* complete negation of word */ |
100 | jnc L(3) /* previous addl caused overflow? */ |
101 | |
102 | xorl %ecx, %edx /* (word+magic)^word */ |
103 | |
104 | andl $~magic, %edx /* any of the carry flags set? */ |
105 | |
106 | jne L(3) /* yes => determine byte */ |
107 | |
108 | |
109 | movl (%eax), %ecx /* get word (= 4 bytes) in question */ |
110 | addl $4, %eax /* adjust pointer for *next* word */ |
111 | |
112 | subl %ecx, %edx /* first step to negate word */ |
113 | addl $magic, %ecx /* add magic word */ |
114 | |
115 | decl %edx /* complete negation of word */ |
116 | jnc L(3) /* previous addl caused overflow? */ |
117 | |
118 | xorl %ecx, %edx /* (word+magic)^word */ |
119 | |
120 | andl $~magic, %edx /* any of the carry flags set? */ |
121 | |
122 | jne L(3) /* yes => determine byte */ |
123 | |
124 | |
125 | movl (%eax), %ecx /* get word (= 4 bytes) in question */ |
126 | addl $4, %eax /* adjust pointer for *next* word */ |
127 | |
128 | subl %ecx, %edx /* first step to negate word */ |
129 | addl $magic, %ecx /* add magic word */ |
130 | |
131 | decl %edx /* complete negation of word */ |
132 | jnc L(3) /* previous addl caused overflow? */ |
133 | |
134 | xorl %ecx, %edx /* (word+magic)^word */ |
135 | |
136 | andl $~magic, %edx /* any of the carry flags set? */ |
137 | |
138 | jne L(3) /* yes => determine byte */ |
139 | |
140 | |
141 | movl (%eax), %ecx /* get word (= 4 bytes) in question */ |
142 | addl $4, %eax /* adjust pointer for *next* word */ |
143 | |
144 | subl %ecx, %edx /* first step to negate word */ |
145 | addl $magic, %ecx /* add magic word */ |
146 | |
147 | decl %edx /* complete negation of word */ |
148 | jnc L(3) /* previous addl caused overflow? */ |
149 | |
150 | xorl %ecx, %edx /* (word+magic)^word */ |
151 | |
152 | andl $~magic, %edx /* any of the carry flags set? */ |
153 | |
154 | je L(1) /* no => start loop again */ |
155 | |
156 | |
157 | L(3): subl $4, %eax /* correct too early pointer increment */ |
158 | subl $magic, %ecx |
159 | |
160 | cmpb $0, %cl /* lowest byte NUL? */ |
161 | jz L(2) /* yes => return */ |
162 | |
163 | inc %eax /* increment pointer */ |
164 | testb %ch, %ch /* second byte NUL? */ |
165 | |
166 | jz L(2) /* yes => return */ |
167 | |
168 | shrl $16, %ecx /* make upper bytes accessible */ |
169 | incl %eax /* increment pointer */ |
170 | |
171 | cmpb $0, %cl /* is third byte NUL? */ |
172 | jz L(2) /* yes => return */ |
173 | |
174 | incl %eax /* increment pointer */ |
175 | |
176 | L(2): subl STR(%esp), %eax /* now compute the length as difference |
177 | between start and terminating NUL |
178 | character */ |
179 | ret |
180 | END (strlen) |
181 | libc_hidden_builtin_def (strlen) |
182 | |