1 | /* Optimized wcslen for x86-64 with SSE2. |
2 | Copyright (C) 2011-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | .text |
22 | ENTRY (__wcslen) /* In: %rdi = s, an array of 4-byte elements. Out: %rax = index of the first zero element. The instructions below touch only rax, rdx, rdi, xmm0-xmm3 and xmm6, and never the stack. */ |
23 | cmpl $0, (%rdi) /* Elements 0-7 are tested one at a time. A zero at index i returns i through L(exit_taili). */ |
24 | jz L(exit_tail0) |
25 | cmpl $0, 4(%rdi) |
26 | jz L(exit_tail1) |
27 | cmpl $0, 8(%rdi) |
28 | jz L(exit_tail2) |
29 | cmpl $0, 12(%rdi) |
30 | jz L(exit_tail3) |
31 | cmpl $0, 16(%rdi) |
32 | jz L(exit_tail4) |
33 | cmpl $0, 20(%rdi) |
34 | jz L(exit_tail5) |
35 | cmpl $0, 24(%rdi) |
36 | jz L(exit_tail6) |
37 | cmpl $0, 28(%rdi) |
38 | jz L(exit_tail7) |
39 | /* Elements 0-7 (the first 32 bytes) are known to be nonzero from here on. */ |
40 | pxor %xmm0, %xmm0 /* xmm0 = 0, the value every element is compared against */ |
41 | |
42 | lea 32(%rdi), %rax /* rax = s + 32, the first byte not yet tested */ |
43 | addq $16, %rdi /* rdi = s + 16: the bias L(exit) subtracts, because rax is always 16 past the vector being reported */ |
44 | and $-16, %rax /* round rax down to a 16-byte boundary: s + 16 < rax <= s + 32, so no untested byte is skipped */ |
45 | /* Twelve unrolled steps follow, each testing one aligned 16-byte vector (4 elements). */ |
46 | pcmpeqd (%rax), %xmm0 /* lanes of xmm0 become all-ones where the element is zero */ |
47 | pmovmskb %xmm0, %edx /* edx = 16-bit mask, 4 bits per element */ |
48 | pxor %xmm1, %xmm1 /* zero xmm1 for the next step */ |
49 | addq $16, %rax /* rax now points just past the vector that was tested */ |
50 | test %edx, %edx |
51 | jnz L(exit) /* a bit is set: this vector holds a zero element */ |
52 | |
53 | pcmpeqd (%rax), %xmm1 |
54 | pmovmskb %xmm1, %edx |
55 | pxor %xmm2, %xmm2 /* zero xmm2 for the next step */ |
56 | addq $16, %rax |
57 | test %edx, %edx |
58 | jnz L(exit) |
59 | |
60 | pcmpeqd (%rax), %xmm2 |
61 | pmovmskb %xmm2, %edx |
62 | pxor %xmm3, %xmm3 /* zero xmm3 for the next step */ |
63 | addq $16, %rax |
64 | test %edx, %edx |
65 | jnz L(exit) |
66 | |
67 | pcmpeqd (%rax), %xmm3 |
68 | pmovmskb %xmm3, %edx |
69 | addq $16, %rax |
70 | test %edx, %edx |
71 | jnz L(exit) |
72 | /* xmm0-xmm3 are all zero again here: each holds a compare result whose mask was 0, so they are reused as the zero operand without another pxor. */ |
73 | pcmpeqd (%rax), %xmm0 |
74 | pmovmskb %xmm0, %edx |
75 | addq $16, %rax |
76 | test %edx, %edx |
77 | jnz L(exit) |
78 | |
79 | pcmpeqd (%rax), %xmm1 |
80 | pmovmskb %xmm1, %edx |
81 | addq $16, %rax |
82 | test %edx, %edx |
83 | jnz L(exit) |
84 | |
85 | pcmpeqd (%rax), %xmm2 |
86 | pmovmskb %xmm2, %edx |
87 | addq $16, %rax |
88 | test %edx, %edx |
89 | jnz L(exit) |
90 | |
91 | pcmpeqd (%rax), %xmm3 |
92 | pmovmskb %xmm3, %edx |
93 | addq $16, %rax |
94 | test %edx, %edx |
95 | jnz L(exit) |
96 | |
97 | pcmpeqd (%rax), %xmm0 |
98 | pmovmskb %xmm0, %edx |
99 | addq $16, %rax |
100 | test %edx, %edx |
101 | jnz L(exit) |
102 | |
103 | pcmpeqd (%rax), %xmm1 |
104 | pmovmskb %xmm1, %edx |
105 | addq $16, %rax |
106 | test %edx, %edx |
107 | jnz L(exit) |
108 | |
109 | pcmpeqd (%rax), %xmm2 |
110 | pmovmskb %xmm2, %edx |
111 | addq $16, %rax |
112 | test %edx, %edx |
113 | jnz L(exit) |
114 | |
115 | pcmpeqd (%rax), %xmm3 |
116 | pmovmskb %xmm3, %edx |
117 | addq $16, %rax |
118 | test %edx, %edx |
119 | jnz L(exit) |
120 | /* Twelve vectors (192 bytes) held no zero element. Switch to a loop that handles 64 bytes per iteration. */ |
121 | and $-0x40, %rax /* round rax down to a 64-byte boundary, which may step back over bytes already tested above */ |
122 | |
123 | .p2align 4 |
124 | L(aligned_64_loop): |
125 | movaps (%rax), %xmm0 /* load four aligned vectors, 64 bytes in total */ |
126 | movaps 16(%rax), %xmm1 |
127 | movaps 32(%rax), %xmm2 |
128 | movaps 48(%rax), %xmm6 |
129 | /* pminub writes its second operand: xmm0 and xmm2 are overwritten, xmm1 and xmm6 stay intact for the recheck after the loop. */ |
130 | pminub %xmm1, %xmm0 |
131 | pminub %xmm6, %xmm2 |
132 | pminub %xmm0, %xmm2 /* xmm2 = byte-wise unsigned minimum of all four vectors */ |
133 | pcmpeqd %xmm3, %xmm2 /* xmm3 is zero here. A zero element in any of the four vectors gives a zero lane in the minimum */ |
134 | pmovmskb %xmm2, %edx |
135 | addq $64, %rax /* advance first: the four vectors are now at rax-64, rax-48, rax-32 and rax-16 */ |
136 | test %edx, %edx |
137 | jz L(aligned_64_loop) /* no zero lane in the minimum, so none of the 16 elements is zero */ |
138 | /* The minimum had a zero lane. Retest the vectors one by one, because bytes taken from different vectors can combine into a zero lane. rdi is adjusted at each step so that rax - rdi in L(exit) is the byte offset from s of the vector under test. */ |
139 | pcmpeqd -64(%rax), %xmm3 /* vector 0, read from memory because xmm0 was overwritten */ |
140 | pmovmskb %xmm3, %edx |
141 | addq $48, %rdi /* rdi = s + 64, matching the vector at rax - 64 */ |
142 | test %edx, %edx |
143 | jnz L(exit) |
144 | |
145 | pcmpeqd %xmm1, %xmm3 /* vector 1, still held in xmm1 */ |
146 | pmovmskb %xmm3, %edx |
147 | addq $-16, %rdi /* rdi = s + 48 */ |
148 | test %edx, %edx |
149 | jnz L(exit) |
150 | |
151 | pcmpeqd -32(%rax), %xmm3 /* vector 2, read from memory because xmm2 was overwritten */ |
152 | pmovmskb %xmm3, %edx |
153 | addq $-16, %rdi /* rdi = s + 32 */ |
154 | test %edx, %edx |
155 | jnz L(exit) |
156 | |
157 | pcmpeqd %xmm6, %xmm3 /* vector 3, still held in xmm6 */ |
158 | pmovmskb %xmm3, %edx |
159 | addq $-16, %rdi /* rdi = s + 16, the value it had on loop entry */ |
160 | test %edx, %edx |
161 | jz L(aligned_64_loop) /* no vector holds a zero element: the hit was spurious. rdi and xmm3 are back at their loop-entry values, so scanning continues */ |
162 | |
163 | .p2align 4 |
164 | L(exit): /* In: rax - rdi = byte offset from s of the vector holding the first zero element, edx = byte mask of that vector. */ |
165 | sub %rdi, %rax /* rax = byte offset of that vector from s */ |
166 | shr $2, %rax /* bytes to elements: index of the first element of the vector */ |
167 | test %dl, %dl /* dl carries the mask bits of elements 0 and 1 */ |
168 | jz L(exit_high) |
169 | |
170 | andl $15, %edx /* bits 0-3 belong to element 0 */ |
171 | jz L(exit_1) /* element 0 is nonzero, so element 1 is the zero */ |
172 | ret /* element 0 is the zero, rax is already the answer */ |
173 | |
174 | /* No align here. Naturally aligned % 16 == 1. */ |
175 | L(exit_high): |
176 | andl $(15 << 8), %edx /* bits 8-11 belong to element 2 */ |
177 | jz L(exit_3) /* element 2 is nonzero, so element 3 is the zero */ |
178 | add $2, %rax |
179 | ret |
180 | |
181 | .p2align 3 |
182 | L(exit_1): |
183 | add $1, %rax |
184 | ret |
185 | |
186 | .p2align 3 |
187 | L(exit_3): |
188 | add $3, %rax |
189 | ret |
190 | /* L(exit_tail0) to L(exit_tail7): return the constant index 0-7 for a zero found by the element-wise tests at entry. */ |
191 | .p2align 3 |
192 | L(exit_tail0): |
193 | xorl %eax, %eax |
194 | ret |
195 | |
196 | .p2align 3 |
197 | L(exit_tail1): |
198 | movl $1, %eax |
199 | ret |
200 | |
201 | .p2align 3 |
202 | L(exit_tail2): |
203 | movl $2, %eax |
204 | ret |
205 | |
206 | .p2align 3 |
207 | L(exit_tail3): |
208 | movl $3, %eax |
209 | ret |
210 | |
211 | .p2align 3 |
212 | L(exit_tail4): |
213 | movl $4, %eax |
214 | ret |
215 | |
216 | .p2align 3 |
217 | L(exit_tail5): |
218 | movl $5, %eax |
219 | ret |
220 | |
221 | .p2align 3 |
222 | L(exit_tail6): |
223 | movl $6, %eax |
224 | ret |
225 | |
226 | .p2align 3 |
227 | L(exit_tail7): |
228 | movl $7, %eax |
229 | ret |
230 | |
231 | END (__wcslen) |
232 | |
233 | weak_alias(__wcslen, wcslen) /* NOTE(review): presumably makes the routine reachable under the name wcslen as a weak symbol. weak_alias is defined outside this file, confirm there. */ |
234 | |