/* strstr with unaligned loads
   Copyright (C) 2009-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

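/* The first two needle bytes are broadcast into %xmm1 and %xmm2.
   16-byte chunks of the haystack are compared against both with
   pcmpeqb, and the two result masks are combined with pminub, so a
   mask byte is 0xff exactly where needle[0] and needle[1] occur at
   consecutive haystack positions.  Each candidate position is then
   verified byte by byte.  NUL detection is folded into the same
   masks, and if candidate verification starts degenerating into
   quadratic work, control switches to the generic __strstr_sse2.  */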
ENTRY(__strstr_sse2_unaligned)
        /* An empty needle matches at the start of the haystack.  */
        movzbl (%rsi), %eax
        testb %al, %al
        je L(empty)
        /* A one-byte needle reduces to strchr.  */
        movzbl 1(%rsi), %edx
        testb %dl, %dl
        je L(strchr)
        /* Broadcast needle[0] to every byte of %xmm1 and needle[1] to
           every byte of %xmm2, interleaved with the page check for
           scheduling.  %rax = page offset of the haystack; above 4031
           fewer than 65 bytes remain on this 4 KiB page, the unaligned
           64-byte prologue below could fault, so take the
           page-crossing path instead.  */
        movd %eax, %xmm1
        movd %edx, %xmm2
        movq %rdi, %rax
        andl $4095, %eax
        punpcklbw %xmm1, %xmm1
        cmpq $4031, %rax
        punpcklbw %xmm2, %xmm2
        punpcklwd %xmm1, %xmm1
        punpcklwd %xmm2, %xmm2
        pshufd $0, %xmm1, %xmm1
        pshufd $0, %xmm2, %xmm2
        ja L(cross_page)
        /* Scan the first 32 bytes for pair matches and for NUL:
           %xmm3/%xmm4 test offsets 0..15, %xmm0/%xmm3 offsets 16..31,
           %xmm6/%xmm5 detect NUL.  */
        movdqu (%rdi), %xmm3
        pxor %xmm5, %xmm5
        movdqu 1(%rdi), %xmm4
        movdqa %xmm3, %xmm6
        pcmpeqb %xmm1, %xmm3
        pcmpeqb %xmm2, %xmm4
        movdqu 16(%rdi), %xmm0
        pcmpeqb %xmm5, %xmm6
        pminub %xmm4, %xmm3
        movdqa %xmm3, %xmm4
        movdqu 17(%rdi), %xmm3
        pcmpeqb %xmm0, %xmm5
        pcmpeqb %xmm2, %xmm3
        por %xmm6, %xmm4
        pcmpeqb %xmm1, %xmm0
        pminub %xmm3, %xmm0
        por %xmm5, %xmm0
        /* Build a 32-bit mask of candidate/NUL positions in %r8.  */
        pmovmskb %xmm4, %r8d
        pmovmskb %xmm0, %eax
        salq $16, %rax
        orq %rax, %r8
        je L(next_32_bytes)
L(next_pair_index):
        /* %rax = address of the lowest candidate.  A set bit may also
           mean NUL, so check for end of string first.  */
        bsf %r8, %rax
        addq %rdi, %rax
        cmpb $0, (%rax)
        je L(zero1)
        movzbl 2(%rsi), %edx
        testb %dl, %dl
        je L(found1)          /* Two-byte needle fully matched.  */
        cmpb 2(%rax), %dl
        jne L(next_pair)
        xorl %edx, %edx
        jmp L(pair_loop_start)

        .p2align 4
L(strchr):
        /* One-byte needle: tail-call strchr.  */
        movzbl %al, %esi
        jmp __strchr_sse2

        .p2align 4
L(pair_loop):
        /* Verify the rest of the needle byte by byte.  */
        addq $1, %rdx
        cmpb 2(%rax,%rdx), %cl
        jne L(next_pair)
L(pair_loop_start):
        movzbl 3(%rsi,%rdx), %ecx
        testb %cl, %cl
        jne L(pair_loop)
L(found1):
        ret
L(zero1):
        xorl %eax, %eax
        ret

        .p2align 4
L(next_pair):
        /* Clear the lowest set bit and try the next candidate.  */
        leaq -1(%r8), %rax
        andq %rax, %r8
        jne L(next_pair_index)

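/* No candidate in bytes 0..31; run the same pair/NUL test on bytes
   32..63 before entering the aligned main loop.  */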
        .p2align 4
L(next_32_bytes):
        movdqu 32(%rdi), %xmm3
        pxor %xmm5, %xmm5
        movdqu 33(%rdi), %xmm4
        movdqa %xmm3, %xmm6
        pcmpeqb %xmm1, %xmm3
        pcmpeqb %xmm2, %xmm4
        movdqu 48(%rdi), %xmm0
        pcmpeqb %xmm5, %xmm6
        pminub %xmm4, %xmm3
        movdqa %xmm3, %xmm4
        movdqu 49(%rdi), %xmm3
        pcmpeqb %xmm0, %xmm5
        pcmpeqb %xmm2, %xmm3
        por %xmm6, %xmm4
        pcmpeqb %xmm1, %xmm0
        pminub %xmm3, %xmm0
        por %xmm5, %xmm0
        /* Candidates for offsets 32..63 go in bits 32..63 of %r8.  */
        pmovmskb %xmm4, %eax
        salq $32, %rax
        pmovmskb %xmm0, %r8d
        salq $48, %r8
        orq %rax, %r8
        je L(loop_header)
L(next_pair2_index):
        bsfq %r8, %rax
        addq %rdi, %rax
        cmpb $0, (%rax)
        je L(zero2)
        movzbl 2(%rsi), %edx
        testb %dl, %dl
        je L(found2)
        cmpb 2(%rax), %dl
        jne L(next_pair2)
        xorl %edx, %edx
        jmp L(pair_loop2_start)

        .p2align 4
L(pair_loop2):
        addq $1, %rdx
        cmpb 2(%rax,%rdx), %cl
        jne L(next_pair2)
L(pair_loop2_start):
        movzbl 3(%rsi,%rdx), %ecx
        testb %cl, %cl
        jne L(pair_loop2)
L(found2):
        ret
L(zero2):
        xorl %eax, %eax
        ret
L(empty):
        /* Empty needle: return the haystack pointer.  */
        mov %rdi, %rax
        ret

        .p2align 4
L(next_pair2):
        leaq -1(%r8), %rax
        andq %rax, %r8
        jne L(next_pair2_index)
L(loop_header):
        /* Set up the aligned main loop.  %r9 remembers the starting
           position and %r11 a budget starting at -512; L(next_pair3)
           uses them to detect quadratic behaviour and switch to the
           generic strstr.  */
        movq $-512, %r11
        movq %rdi, %r9

        pxor %xmm7, %xmm7
        andq $-64, %rdi

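/* Main loop: scan 64 haystack bytes per iteration.  data XOR
   broadcast byte is zero exactly where the byte matches, so the OR of
   the two XOR results (offset by one) has a zero byte wherever a
   needle[0],needle[1] pair starts.  pminub folds the four pair tests
   and the raw data (whose zero bytes are NULs) into %xmm0, so one
   pcmpeqb against zero answers "any candidate or end of string in
   these 64 bytes?" at once.  */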
        .p2align 4
L(loop):
        movdqa 64(%rdi), %xmm3
        movdqu 63(%rdi), %xmm6
        movdqa %xmm3, %xmm0
        pxor %xmm2, %xmm3
        pxor %xmm1, %xmm6
        movdqa 80(%rdi), %xmm10
        por %xmm3, %xmm6
        pminub %xmm10, %xmm0
        movdqu 79(%rdi), %xmm3
        pxor %xmm2, %xmm10
        pxor %xmm1, %xmm3
        movdqa 96(%rdi), %xmm9
        por %xmm10, %xmm3
        pminub %xmm9, %xmm0
        pxor %xmm2, %xmm9
        movdqa 112(%rdi), %xmm8
        addq $64, %rdi
        pminub %xmm6, %xmm3
        movdqu 31(%rdi), %xmm4
        pminub %xmm8, %xmm0
        pxor %xmm2, %xmm8
        pxor %xmm1, %xmm4
        por %xmm9, %xmm4
        pminub %xmm4, %xmm3
        movdqu 47(%rdi), %xmm5
        pxor %xmm1, %xmm5
        por %xmm8, %xmm5
        pminub %xmm5, %xmm3
        pminub %xmm3, %xmm0
        pcmpeqb %xmm7, %xmm0
        pmovmskb %xmm0, %eax
        testl %eax, %eax
        je L(loop)
        /* Something was found; rebuild exact per-block masks, folding
           NUL positions into the pair masks, and combine them into a
           64-bit candidate mask in %r8.  */
        pminub (%rdi), %xmm6
        pminub 32(%rdi), %xmm4
        pminub 48(%rdi), %xmm5
        pcmpeqb %xmm7, %xmm6
        pcmpeqb %xmm7, %xmm5
        pmovmskb %xmm6, %edx
        movdqa 16(%rdi), %xmm8
        pcmpeqb %xmm7, %xmm4
        movdqu 15(%rdi), %xmm0
        pmovmskb %xmm5, %r8d
        movdqa %xmm8, %xmm3
        pmovmskb %xmm4, %ecx
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm2, %xmm3
        salq $32, %rcx
        pcmpeqb %xmm7, %xmm8
        salq $48, %r8
        pminub %xmm0, %xmm3
        orq %rcx, %rdx
        por %xmm3, %xmm8
        orq %rdx, %r8
        pmovmskb %xmm8, %eax
        salq $16, %rax
        orq %rax, %r8
        je L(loop)
L(next_pair_index3):
        /* In this path a set bit at offset i denotes a pair starting
           at %rdi + i - 1, hence the -1 in L(success3) and the use of
           1(%rcx) below.  */
        bsfq %r8, %rcx
        addq %rdi, %rcx
        cmpb $0, (%rcx)
        je L(zero)
        xorl %eax, %eax
        movzbl 2(%rsi), %edx
        testb %dl, %dl
        je L(success3)
        cmpb 1(%rcx), %dl
        jne L(next_pair3)
        jmp L(pair_loop_start3)

        .p2align 4
L(pair_loop3):
        addq $1, %rax
        cmpb 1(%rcx,%rax), %dl
        jne L(next_pair3)
L(pair_loop_start3):
        movzbl 3(%rsi,%rax), %edx
        testb %dl, %dl
        jne L(pair_loop3)
L(success3):
        /* The match starts one byte before the candidate bit.  */
        lea -1(%rcx), %rax
        ret

        .p2align 4
L(next_pair3):
        /* Fallback heuristic: accumulate the length of each failed
           comparison in %r11 (which starts at -512).  If the haystack
           distance covered so far is smaller than that total, the
           pair filter is degenerating towards quadratic time, so
           switch to the generic __strstr_sse2.  */
        addq %rax, %r11
        movq %rdi, %rax
        subq %r9, %rax
        cmpq %r11, %rax
        jl L(switch_strstr)
        leaq -1(%r8), %rax
        andq %rax, %r8
        jne L(next_pair_index3)
        jmp L(loop)

        .p2align 4
L(switch_strstr):
        /* Tail-call the generic strstr on the remaining haystack;
           the movq below is effectively a no-op.  */
        movq %rdi, %rdi
        jmp __strstr_sse2

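/* Entry path for haystacks that start within 64 bytes of a page end.
   Align the read pointer down to 64 so every load stays on the
   haystack's page, run the pair/NUL test on that aligned 64-byte
   block, then shift the mask right by the misalignment so bit 0
   corresponds to the first real haystack byte.  */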
        .p2align 4
L(cross_page):
        movq %rdi, %rax
        pxor %xmm0, %xmm0
        andq $-64, %rax
        movdqa (%rax), %xmm3
        movdqu -1(%rax), %xmm4
        movdqa %xmm3, %xmm8
        movdqa 16(%rax), %xmm5
        pcmpeqb %xmm1, %xmm4
        pcmpeqb %xmm0, %xmm8
        pcmpeqb %xmm2, %xmm3
        movdqa %xmm5, %xmm7
        pminub %xmm4, %xmm3
        movdqu 15(%rax), %xmm4
        pcmpeqb %xmm0, %xmm7
        por %xmm3, %xmm8
        movdqa %xmm5, %xmm3
        movdqa 32(%rax), %xmm5
        pcmpeqb %xmm1, %xmm4
        pcmpeqb %xmm2, %xmm3
        movdqa %xmm5, %xmm6
        pmovmskb %xmm8, %ecx
        pminub %xmm4, %xmm3
        movdqu 31(%rax), %xmm4
        por %xmm3, %xmm7
        movdqa %xmm5, %xmm3
        pcmpeqb %xmm0, %xmm6
        movdqa 48(%rax), %xmm5
        pcmpeqb %xmm1, %xmm4
        pmovmskb %xmm7, %r8d
        pcmpeqb %xmm2, %xmm3
        pcmpeqb %xmm5, %xmm0
        pminub %xmm4, %xmm3
        movdqu 47(%rax), %xmm4
        por %xmm3, %xmm6
        movdqa %xmm5, %xmm3
        salq $16, %r8
        pcmpeqb %xmm1, %xmm4
        pcmpeqb %xmm2, %xmm3
        pmovmskb %xmm6, %r10d
        pminub %xmm4, %xmm3
        por %xmm3, %xmm0
        salq $32, %r10
        orq %r10, %r8
        orq %rcx, %r8
        /* Discard mask bits for bytes before the haystack start.  */
        movl %edi, %ecx
        pmovmskb %xmm0, %edx
        subl %eax, %ecx
        salq $48, %rdx
        orq %rdx, %r8
        shrq %cl, %r8
        je L(loop_header)
L(next_pair_index4):
        bsfq %r8, %rax
        addq %rdi, %rax
        cmpb $0, (%rax)
        je L(zero)

        /* A candidate at %rdi itself would place the pair start one
           byte before the haystack; skip it.  */
        cmpq %rax, %rdi
        je L(next_pair4)

        movzbl 2(%rsi), %edx
        testb %dl, %dl
        je L(found3)
        cmpb 1(%rax), %dl
        jne L(next_pair4)
        xorl %edx, %edx
        jmp L(pair_loop_start4)

        .p2align 4
L(pair_loop4):
        addq $1, %rdx
        cmpb 1(%rax,%rdx), %cl
        jne L(next_pair4)
L(pair_loop_start4):
        movzbl 3(%rsi,%rdx), %ecx
        testb %cl, %cl
        jne L(pair_loop4)
L(found3):
        /* As in L(success3), the match starts at candidate - 1.  */
        subq $1, %rax
        ret

        .p2align 4
L(next_pair4):
        leaq -1(%r8), %rax
        andq %rax, %r8
        jne L(next_pair_index4)
        jmp L(loop_header)

        .p2align 4
L(found):
        /* Two-byte "rep ret", avoiding a branch-prediction penalty on
           older AMD processors when a ret is a branch target.  */
        rep
        ret

        .p2align 4
L(zero):
        xorl %eax, %eax
        ret

END(__strstr_sse2_unaligned)

Source: glibc, sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
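
For readers who don't speak SSE2, here is a minimal scalar sketch of the
same candidate-filtering idea: find positions where the first two needle
bytes match, and only then verify the rest. This is an illustrative model
written for this note, not glibc code; the function name is made up, and
it omits the SIMD pair test, the page-boundary handling, and the
quadratic-behaviour fallback to __strstr_sse2.

/* two_byte_filter.c -- hypothetical scalar model of the pair filter.  */
#include <stdio.h>
#include <string.h>

static const char *
two_byte_filter_strstr (const char *s, const char *needle)
{
  if (needle[0] == '\0')
    return s;                      /* Empty needle: match at start (L(empty)).  */
  if (needle[1] == '\0')
    return strchr (s, needle[0]);  /* One-byte needle: plain strchr (L(strchr)).  */

  for (; *s != '\0'; s++)
    /* Pair filter: both leading bytes must match before the full
       comparison runs (the pcmpeqb/pminub test in the SSE2 code).  */
    if (s[0] == needle[0] && s[1] == needle[1])
      {
        const char *h = s + 2, *n = needle + 2;
        while (*n != '\0' && *h == *n)
          h++, n++;
        if (*n == '\0')
          return s;                /* Whole needle matched.  */
      }
  return NULL;
}

int
main (void)
{
  const char *hay = "no match here, then a needle at last";
  const char *hit = two_byte_filter_strstr (hay, "needle");
  printf ("%s\n", hit ? hit : "(null)");
  return 0;
}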