/* Optimized wcslen for x86-64 with SSE2.
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* size_t __wcslen (const wchar_t *s)

   Return the number of 4-byte wide characters that precede the first
   zero element of S.  GAS AT&T syntax; the argument is taken from
   %rdi and the result is returned in %rax.

   In:       %rdi = s
   Out:      %rax = length, counted in wide characters
   Clobbers: %rdx, %rdi, %xmm0-%xmm3, %xmm6, flags
   Stack:    not used

   Register roles after the scalar prologue:
     %rax  address 16 bytes past the vector checked last (64 bytes past
	   the loop base while inside the 64-byte loop).
     %rdi  bias chosen so that at L(exit) %rax - %rdi is the byte
	   offset from S of the vector that holds the terminator.
     %edx  pmovmskb mask of the last comparison (4 bits per dword).

   The vector code examines dwords at 16-byte aligned addresses, so it
   relies on S being 4-byte aligned.  All vector memory operands are
   16-byte aligned (see the `and' instructions below), hence a vector
   read never straddles a page boundary.  */
	.text
ENTRY (__wcslen)
	/* Scalar prologue: test the first eight wide characters
	   (32 bytes) one at a time and return their index directly.  */
	cmpl	$0, (%rdi)
	jz	L(exit_tail0)
	cmpl	$0, 4(%rdi)
	jz	L(exit_tail1)
	cmpl	$0, 8(%rdi)
	jz	L(exit_tail2)
	cmpl	$0, 12(%rdi)
	jz	L(exit_tail3)
	cmpl	$0, 16(%rdi)
	jz	L(exit_tail4)
	cmpl	$0, 20(%rdi)
	jz	L(exit_tail5)
	cmpl	$0, 24(%rdi)
	jz	L(exit_tail6)
	cmpl	$0, 28(%rdi)
	jz	L(exit_tail7)

	pxor	%xmm0, %xmm0

	/* %rax = (s + 32) rounded down to 16 bytes: the first aligned
	   vector to check.  It may re-examine a few characters already
	   tested above, which is harmless because they are non-zero.
	   %rdi = s + 16 compensates for %rax always being stepped 16
	   bytes past the vector under test before L(exit) is reached.  */
	lea	32(%rdi), %rax
	addq	$16, %rdi
	and	$-16, %rax

	/* Twelve unrolled 16-byte checks.  Each compares one aligned
	   vector against zero; a non-zero byte mask means the vector
	   contains the terminator.  When a check fails the compare
	   result (and thus the register) is all-zero again, so
	   %xmm0-%xmm3 can be reused as zero without re-clearing.  */
	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm0
	pmovmskb %xmm0, %edx
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm1
	pmovmskb %xmm1, %edx
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm2
	pmovmskb %xmm2, %edx
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqd	(%rax), %xmm3
	pmovmskb %xmm3, %edx
	addq	$16, %rax
	test	%edx, %edx
	jnz	L(exit)

	/* Round %rax down to a 64-byte boundary for the main loop.  This
	   steps back by at most 48 bytes, all of which were covered by
	   the checks above.  */
	and	$-0x40, %rax

	/* Main loop: 64 bytes per iteration.  Invariant on entry to each
	   iteration: %xmm3 is all-zero and %rdi == s + 16.  */
	.p2align 4
L(aligned_64_loop):
	movaps	(%rax), %xmm0
	movaps	16(%rax), %xmm1
	movaps	32(%rax), %xmm2
	movaps	48(%rax), %xmm6

	/* Fold the four vectors with an unsigned byte-wise minimum and
	   test the result for a zero dword.  A zero dword in any input
	   always survives the fold, but the byte-wise minimum can also
	   create a zero dword out of non-zero inputs, so a hit here is
	   only a candidate and is verified per vector below.  */
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqd	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	addq	$64, %rax
	test	%edx, %edx
	jz	L(aligned_64_loop)

	/* Candidate found; %rax is now loop base + 64.  Re-check the four
	   vectors in order.  Before each branch %rdi is adjusted so that
	   %rax - %rdi equals the offset of the vector being tested from
	   S.  %xmm0 and %xmm2 were overwritten by the fold, so vectors
	   0 and 2 are compared from memory; %xmm1 and %xmm6 still hold
	   vectors 1 and 3.  */

	/* Vector 0 at base + 0: %rdi = s + 64.  */
	pcmpeqd	-64(%rax), %xmm3
	pmovmskb %xmm3, %edx
	addq	$48, %rdi
	test	%edx, %edx
	jnz	L(exit)

	/* Vector 1 at base + 16: %rdi = s + 48.  */
	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	addq	$-16, %rdi
	test	%edx, %edx
	jnz	L(exit)

	/* Vector 2 at base + 32: %rdi = s + 32.  */
	pcmpeqd	-32(%rax), %xmm3
	pmovmskb %xmm3, %edx
	addq	$-16, %rdi
	test	%edx, %edx
	jnz	L(exit)

	/* Vector 3 at base + 48: %rdi = s + 16.  If this one is clean as
	   well the candidate was a false positive; %rdi is back at
	   s + 16 and %xmm3 is zero, so the loop invariant holds again.  */
	pcmpeqd	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	addq	$-16, %rdi
	test	%edx, %edx
	jz	L(aligned_64_loop)

	/* Common exit.  %rax - %rdi is the byte offset from S of the
	   vector holding the terminator; dividing by 4 gives the index
	   of its first wide character.  %edx holds the byte mask of that
	   vector: bits 0-3 belong to dword 0, 4-7 to dword 1, 8-11 to
	   dword 2 and 12-15 to dword 3.  */
	.p2align 4
L(exit):
	sub	%rdi, %rax
	shr	$2, %rax
	/* Low mask byte clear: the first zero is in dword 2 or 3.  */
	test	%dl, %dl
	jz	L(exit_high)

	/* Bits 0-3 set: dword 0, %rax is already the answer.
	   Otherwise the zero is in dword 1.  */
	andl	$15, %edx
	jz	L(exit_1)
	ret

	/* No align here.  Naturally aligned % 16 == 1.  */
L(exit_high):
	/* Bits 8-11 set: dword 2.  Otherwise dword 3.  */
	andl	$(15 << 8), %edx
	jz	L(exit_3)
	add	$2, %rax
	ret

	.p2align 3
L(exit_1):
	add	$1, %rax
	ret

	.p2align 3
L(exit_3):
	add	$3, %rax
	ret

	/* Returns for the scalar prologue: the length is the index of the
	   element that compared equal to zero.  */
	.p2align 3
L(exit_tail0):
	xorl	%eax, %eax
	ret

	.p2align 3
L(exit_tail1):
	movl	$1, %eax
	ret

	.p2align 3
L(exit_tail2):
	movl	$2, %eax
	ret

	.p2align 3
L(exit_tail3):
	movl	$3, %eax
	ret

	.p2align 3
L(exit_tail4):
	movl	$4, %eax
	ret

	.p2align 3
L(exit_tail5):
	movl	$5, %eax
	ret

	.p2align 3
L(exit_tail6):
	movl	$6, %eax
	ret

	.p2align 3
L(exit_tail7):
	movl	$7, %eax
	ret

END (__wcslen)

weak_alias(__wcslen, wcslen)
/* Source: glibc/sysdeps/x86_64/wcslen.S  */