1/* Fast SSE2 rawmemchr with a 64-byte loop using the pmaxub instruction.
2
3 Copyright (C) 2011-2022 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#include <sysdep.h>
21
22 .text
/* __rawmemchr: scan forward from a start address for a byte value.
   The code has no length limit and no terminator check; the only way
   out is a match, so the byte must be present in readable memory.

   Register roles, as used by the code below:
     %rdi   in: start address; afterwards the current scan position
            (16-byte aligned after the first probe, 64-byte aligned in
            the main loop).
     %rsi   in: value to search for; only its low 8 bits reach the
            comparison pattern.
     %rax   out: address of the lowest-addressed matching byte.
     %rcx   misalignment of the start address (mod 64, then mod 16).
     %xmm1  the search byte replicated into all 16 byte lanes.
     %xmm0, %xmm2-%xmm4  per-chunk pcmpeqb results.
   NOTE(review): %rdi/%rsi presumably carry the (s, c) arguments of
   rawmemchr under the x86-64 SysV calling convention -- confirm
   against the C prototype.  */
23ENTRY (__rawmemchr)
/* Build the 16-byte pattern in %xmm1: the two punpcklbw steps spread
   byte 0 over the low dword, pshufd $0 copies that dword to the other
   three.  Interleaved with that, %rcx = %rdi & 63 is the offset of the
   start address inside its 64-byte block.  */
24 movd %rsi, %xmm1
25 mov %rdi, %rcx
26
27 punpcklbw %xmm1, %xmm1
28 punpcklbw %xmm1, %xmm1
29
30 and $63, %rcx
31 pshufd $0, %xmm1, %xmm1
32
/* With an offset above 48, a 16-byte load from the start address would
   extend past the end of the 64-byte block; use the aligned-load path
   for that case instead.  */
33 cmp $48, %rcx
34 ja L(crosscache)
35
/* First probe: unaligned load of the 16 bytes at the start address.  */
36 movdqu (%rdi), %xmm0
37 pcmpeqb %xmm1, %xmm0
38/* Check if there is a match. */
39 pmovmskb %xmm0, %eax
40 test %eax, %eax
41
42 jnz L(matches)
/* No match: %rdi = (start + 16) & -16, the first 16-byte boundary above
   the start address.  Any bytes between that boundary and start + 16
   were already covered by the probe and are simply examined again.  */
43 add $16, %rdi
44 and $-16, %rdi
45 jmp L(loop_prolog)
46
47 .p2align 4
48L(crosscache):
/* %rcx = offset of the start address inside its 16-byte chunk; %rdi is
   rounded down to that chunk so the aligned load stays inside it.  */
49 and $15, %rcx
50 and $-16, %rdi
51 movdqa (%rdi), %xmm0
52
53 pcmpeqb %xmm1, %xmm0
54/* Check if there is a match. */
55 pmovmskb %xmm0, %eax
56/* Remove the leading bytes. */
/* The shift discards mask bits for bytes that lie before the start
   address.  pmovmskb leaves bits 16-31 of %eax clear, so the arithmetic
   shift brings in zeros.  */
57 sar %cl, %eax
58 test %eax, %eax
59 je L(unaligned_no_match)
60/* Check which byte is a match. */
61 bsf %eax, %eax
62
/* The bit index is now relative to the start address, so the result is
   aligned base + index + dropped offset.  */
63 add %rdi, %rax
64 add %rcx, %rax
65 ret
66
67 .p2align 4
68L(unaligned_no_match):
/* Step to the next 16-byte chunk and fall through into the prolog.  */
69 add $16, %rdi
70
71 .p2align 4
72L(loop_prolog):
/* %rdi is 16-byte aligned here.  Test four consecutive chunks one at a
   time (offsets 0, 16, 32, 48); each L(matchesNN) target adds the
   corresponding offset when forming the result.  */
73 movdqa (%rdi), %xmm0
74 pcmpeqb %xmm1, %xmm0
75 pmovmskb %xmm0, %eax
76 test %eax, %eax
77 jnz L(matches)
78
79 movdqa 16(%rdi), %xmm2
80 pcmpeqb %xmm1, %xmm2
81 pmovmskb %xmm2, %eax
82 test %eax, %eax
83 jnz L(matches16)
84
85 movdqa 32(%rdi), %xmm3
86 pcmpeqb %xmm1, %xmm3
87 pmovmskb %xmm3, %eax
88 test %eax, %eax
89 jnz L(matches32)
90
/* For the fourth chunk %rdi is advanced by 64 before the branch, which
   is why L(matches0) compensates with a -16 displacement.  */
91 movdqa 48(%rdi), %xmm4
92 pcmpeqb %xmm1, %xmm4
93 add $64, %rdi
94 pmovmskb %xmm4, %eax
95 test %eax, %eax
96 jnz L(matches0)
97
/* If %rdi is already a multiple of 64, enter the main loop directly.  */
98 test $0x3f, %rdi
99 jz L(align64_loop)
100
/* Otherwise test another 64 bytes chunk by chunk, so that rounding %rdi
   down to a 64-byte boundary afterwards only steps back over bytes
   that have already been checked.  */
101 movdqa (%rdi), %xmm0
102 pcmpeqb %xmm1, %xmm0
103 pmovmskb %xmm0, %eax
104 test %eax, %eax
105 jnz L(matches)
106
107 movdqa 16(%rdi), %xmm2
108 pcmpeqb %xmm1, %xmm2
109 pmovmskb %xmm2, %eax
110 test %eax, %eax
111 jnz L(matches16)
112
113 movdqa 32(%rdi), %xmm3
114 pcmpeqb %xmm1, %xmm3
115 pmovmskb %xmm3, %eax
116 test %eax, %eax
117 jnz L(matches32)
118
119 movdqa 48(%rdi), %xmm3
120 pcmpeqb %xmm1, %xmm3
121 pmovmskb %xmm3, %eax
122
123 add $64, %rdi
124 test %eax, %eax
125 jnz L(matches0)
126
/* Round down to a 64-byte boundary; the bytes stepped back over were
   covered by the block above and get rescanned by the loop.  */
127 and $-64, %rdi
128
129 .p2align 4
130L(align64_loop):
/* Main loop: %rdi is 64-byte aligned.  Compare four 16-byte chunks
   against the pattern.  */
131 movdqa (%rdi), %xmm0
132 movdqa 16(%rdi), %xmm2
133 movdqa 32(%rdi), %xmm3
134 movdqa 48(%rdi), %xmm4
135
136 pcmpeqb %xmm1, %xmm0
137 pcmpeqb %xmm1, %xmm2
138 pcmpeqb %xmm1, %xmm3
139 pcmpeqb %xmm1, %xmm4
140
/* Each compare result byte is 0x00 or 0xff, so the unsigned byte
   maximum acts as a bytewise OR.  After these three instructions %xmm4
   is non-zero iff any of the 64 bytes matched, and a single pmovmskb
   decides the whole block.  %xmm0 and %xmm2 are only read here and
   keep their individual results; %xmm3 and %xmm4 are overwritten.  */
141 pmaxub %xmm0, %xmm3
142 pmaxub %xmm2, %xmm4
143 pmaxub %xmm3, %xmm4
144 pmovmskb %xmm4, %eax
145
146 add $64, %rdi
147
148 test %eax, %eax
149 jz L(align64_loop)
150
/* A match lies somewhere in the block just scanned; point %rdi back at
   it and find the first matching chunk in address order.  */
151 sub $64, %rdi
152
153 pmovmskb %xmm0, %eax
154 test %eax, %eax
155 jnz L(matches)
156
157 pmovmskb %xmm2, %eax
158 test %eax, %eax
159 jnz L(matches16)
160
/* The individual results for chunks 32 and 48 were destroyed by the
   merge above, so both are recomputed from memory.  The second compare
   writes into %xmm1; the pattern is not needed again because every
   remaining path returns.  */
161 movdqa 32(%rdi), %xmm3
162 pcmpeqb %xmm1, %xmm3
163
164 pcmpeqb 48(%rdi), %xmm1
165 pmovmskb %xmm3, %eax
166 test %eax, %eax
167 jnz L(matches32)
168
/* Chunks 0, 16 and 32 were clear, so the match must be in chunk 48;
   no test of the mask is needed before bsf.  */
169 pmovmskb %xmm1, %eax
170 bsf %eax, %eax
171 lea 48(%rdi, %rax), %rax
172 ret
173
/* Return stubs.  On entry %eax is a non-zero pmovmskb mask; bsf yields
   the index of the lowest matching byte within the chunk, which is
   added to %rdi plus the chunk's displacement.  */
174 .p2align 4
175L(matches0):
/* Reached after %rdi was already advanced by 64 past a chunk at offset
   48, hence the net displacement of -16.  */
176 bsf %eax, %eax
177 lea -16(%rax, %rdi), %rax
178 ret
179
180 .p2align 4
181L(matches):
182 bsf %eax, %eax
183 add %rdi, %rax
184 ret
185
186 .p2align 4
187L(matches16):
188 bsf %eax, %eax
189 lea 16(%rax, %rdi), %rax
190 ret
191
192 .p2align 4
193L(matches32):
194 bsf %eax, %eax
195 lea 32(%rax, %rdi), %rax
196 ret
197
198END (__rawmemchr)
199
/* Symbol bookkeeping for the routine.  NOTE(review): weak_alias and
   libc_hidden_builtin_def are macros defined outside this file.
   Presumably the first makes rawmemchr a weak alias of __rawmemchr and
   the second provides the hidden definition used for libc-internal
   references -- confirm against their definitions.  */
200weak_alias (__rawmemchr, rawmemchr)
201libc_hidden_builtin_def (__rawmemchr)
202

source code of glibc/sysdeps/x86_64/rawmemchr.S