1 | /* wcslen with SSE2 |
2 | Copyright (C) 2011-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
#if IS_IN (libc)
# include <sysdep.h>
/* Offset from %esp, at function entry, of the string pointer argument.  */
# define STR	4

	.text
/* wcslen with SSE2 for i386.

   Reads the string address from STR(%esp) and returns in %eax the
   number of 4-byte elements that precede the first zero element.

   Clobbers: %eax, %ecx, %edx, %xmm0-%xmm3, %xmm6, flags.

   Register roles once the vector path is entered:
     %eax   address just past the 16-byte chunk (or 64-byte block)
	    that is being tested
     %ecx   bias chosen so that, on every jump to L(exit),
	    %eax - %ecx is the byte offset of the chunk holding the
	    terminator from the start of the string
     %edx   pmovmskb mask of the chunk: 4 identical bits per element
     %xmm3  all-zero comparand while L(aligned_64_loop) runs

   NOTE(review): the vector path compares dwords at 16-byte-aligned
   addresses, so it presumably relies on the string being 4-byte
   aligned -- confirm against callers.  */
ENTRY (__wcslen_sse2)
	mov	STR(%esp), %edx

	/* Test the first eight elements one by one; L(exit_tailN)
	   returns N.  */
	cmpl	$0, (%edx)
	jz	L(exit_tail0)
	cmpl	$0, 4(%edx)
	jz	L(exit_tail1)
	cmpl	$0, 8(%edx)
	jz	L(exit_tail2)
	cmpl	$0, 12(%edx)
	jz	L(exit_tail3)
	cmpl	$0, 16(%edx)
	jz	L(exit_tail4)
	cmpl	$0, 20(%edx)
	jz	L(exit_tail5)
	cmpl	$0, 24(%edx)
	jz	L(exit_tail6)
	cmpl	$0, 28(%edx)
	jz	L(exit_tail7)

	pxor	%xmm0, %xmm0

	/* %eax = (s + 32) rounded down to a 16-byte boundary, so the
	   first chunk may overlap elements already known to be nonzero.
	   %ecx = s + 16 compensates for %eax being advanced by 16
	   before each jump to L(exit).  */
	lea	32(%edx), %eax
	lea	16(%edx), %ecx
	and	$-16, %eax

	/* Test four aligned 16-byte chunks.  pcmpeqd against a zeroed
	   register sets every lane that holds a zero element to all
	   ones; pmovmskb gathers the byte sign bits into %edx.  %eax is
	   advanced with lea, which leaves the flags set by test
	   untouched for the following jnz.  */
	pcmpeqd	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	/* No match here leaves %xmm3 all zero, which the loop below
	   relies on.  */
	pcmpeqd	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	/* Round %eax down to a 64-byte boundary.  Any chunks this steps
	   back over have already been tested above.  */
	and	$-0x40, %eax

	.p2align 4
L(aligned_64_loop):
	/* Load one aligned 64-byte block.  */
	movaps	(%eax), %xmm0
	movaps	16(%eax), %xmm1
	movaps	32(%eax), %xmm2
	movaps	48(%eax), %xmm6

	/* Fold the four chunks with a byte-wise unsigned minimum and
	   look for a zero dword in the result.  A zero element in any
	   chunk always yields a zero dword, but so can nonzero elements
	   whose zero bytes sit in different positions, so a hit is only
	   a candidate and is re-checked chunk by chunk below.  */
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqd	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	64(%eax), %eax
	jz	L(aligned_64_loop)

	/* Candidate found; %eax is now 64 bytes past the block start.
	   Re-test each chunk against %xmm3.  %xmm0 and %xmm2 were
	   overwritten by pminub, so chunks 0 and 2 are re-read from
	   memory; %xmm1 and %xmm6 still hold chunks 1 and 3.  A failed
	   pcmpeqd leaves %xmm3 all zero again.  %ecx is moved so that
	   %eax - %ecx names the chunk under test; the four adjustments
	   (+48, -16, -16, -16) cancel out when nothing matches.  */
	pcmpeqd	-64(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	48(%ecx), %ecx
	jnz	L(exit)

	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	pcmpeqd	-32(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	pcmpeqd	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	/* False positive from the byte-wise minimum: keep scanning.  */
	jmp	L(aligned_64_loop)

	.p2align 4
L(exit):
	/* %eax = index of the first element of the matching chunk.  */
	sub	%ecx, %eax
	shr	$2, %eax
	/* %dl covers elements 0 and 1 of the chunk, %dh elements 2 and
	   3; within each byte the low nibble is the even element.  */
	test	%dl, %dl
	jz	L(exit_high)

	/* Test the nibble in place; only ZF is needed, so no scratch
	   byte register is written.  */
	test	$15, %dl
	jz	L(exit_1)
	ret

	.p2align 4
L(exit_high):
	test	$15, %dh
	jz	L(exit_3)
	add	$2, %eax
	ret

	.p2align 4
L(exit_1):
	add	$1, %eax
	ret

	.p2align 4
L(exit_3):
	add	$3, %eax
	ret

	/* Returns for a terminator found among the first eight
	   elements.  */
	.p2align 4
L(exit_tail0):
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_tail1):
	mov	$1, %eax
	ret

	.p2align 4
L(exit_tail2):
	mov	$2, %eax
	ret

	.p2align 4
L(exit_tail3):
	mov	$3, %eax
	ret

	.p2align 4
L(exit_tail4):
	mov	$4, %eax
	ret

	.p2align 4
L(exit_tail5):
	mov	$5, %eax
	ret

	.p2align 4
L(exit_tail6):
	mov	$6, %eax
	ret

	.p2align 4
L(exit_tail7):
	mov	$7, %eax
	ret

END (__wcslen_sse2)
#endif
193 | |