1 | /* Fast SSE2 rawmemchr using a 64-byte loop and the pmaxub instruction. |
2 | |
3 | Copyright (C) 2011-2022 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include <sysdep.h> |
21 | |
22 | .text |
/* __rawmemchr -- find the first byte equal to a given value, scanning
   forward from a start address with no length limit.

   Presumably the C prototype is
     void *rawmemchr (const void *s, int c);
   called with the System V AMD64 convention -- TODO confirm against
   the declaring header.

   In:       %rdi = address to start searching at
             %rsi = byte to look for (only the low byte is used)
   Out:      %rax = address of the first matching byte
   Clobbers: %rcx, %rdi, %xmm0-%xmm4, flags

   There is no bounds check anywhere in this routine: it keeps reading
   until a match is found.  Apart from the initial movdqu, every load
   is an aligned 16-byte load, so bytes before the start address (in
   L(crosscache)) and bytes after the match may be read, but they
   never influence the returned address.  */
23 | ENTRY (__rawmemchr) |
/* Broadcast the low byte of %rsi into all 16 bytes of %xmm1: the two
   punpcklbw steps replicate it across the low dword and pshufd $0
   then copies that dword into every lane.  The independent %rcx
   computation is interleaved with the broadcast.  */
24 | movd %rsi, %xmm1 |
25 | mov %rdi, %rcx |
26 | |
27 | punpcklbw %xmm1, %xmm1 |
28 | punpcklbw %xmm1, %xmm1 |
29 | |
/* %rcx = offset of the start address within its 64-byte block.  */
30 | and $63, %rcx |
31 | pshufd $0, %xmm1, %xmm1 |
32 | |
/* An offset above 48 means a 16-byte load from %rdi would run past
   the end of that 64-byte block, so use the aligned-load path.  */
33 | cmp $48, %rcx |
34 | ja L(crosscache) |
35 | |
/* Otherwise check the first 16 bytes with a single unaligned load.  */
36 | movdqu (%rdi), %xmm0 |
37 | pcmpeqb %xmm1, %xmm0 |
38 | /* Check if there is a match. */ |
39 | pmovmskb %xmm0, %eax |
40 | test %eax, %eax |
41 | |
42 | jnz L(matches) |
/* No match: round %rdi up to the next 16-byte boundary.  Every byte
   skipped over lies inside the 16 bytes that were just checked.  */
43 | add $16, %rdi |
44 | and $-16, %rdi |
45 | jmp L(loop_prolog) |
46 | |
47 | .p2align 4 |
48 | L(crosscache): |
/* %rcx = offset of the start within its 16-byte chunk; %rdi = that
   chunk's aligned base, so the aligned load below stays inside the
   64-byte block.  */
49 | and $15, %rcx |
50 | and $-16, %rdi |
51 | movdqa (%rdi), %xmm0 |
52 | |
53 | pcmpeqb %xmm1, %xmm0 |
54 | /* Check if there is a match. */ |
55 | pmovmskb %xmm0, %eax |
56 | /* Remove the leading bytes. */ |
/* The mask occupies only the low 16 bits of %eax, so this arithmetic
   shift simply drops the bits of the %cl bytes that precede the
   start address.  */
57 | sar %cl, %eax |
58 | test %eax, %eax |
59 | je L(unaligned_no_match) |
60 | /* Check which byte is a match. */ |
61 | bsf %eax, %eax |
62 | |
/* %rax = aligned chunk base + offset of start + index past start.  */
63 | add %rdi, %rax |
64 | add %rcx, %rax |
65 | ret |
66 | |
67 | .p2align 4 |
68 | L(unaligned_no_match): |
/* Nothing in the partial first chunk: step to the next aligned chunk
   and fall through into the prolog.  */
69 | add $16, %rdi |
70 | |
71 | .p2align 4 |
72 | L(loop_prolog): |
/* %rdi is 16-byte aligned here.  Check the chunks at %rdi+0, +16, +32
   and +48 one at a time; each branch target adds the offset of the
   chunk it was reached from.  */
73 | movdqa (%rdi), %xmm0 |
74 | pcmpeqb %xmm1, %xmm0 |
75 | pmovmskb %xmm0, %eax |
76 | test %eax, %eax |
77 | jnz L(matches) |
78 | |
79 | movdqa 16(%rdi), %xmm2 |
80 | pcmpeqb %xmm1, %xmm2 |
81 | pmovmskb %xmm2, %eax |
82 | test %eax, %eax |
83 | jnz L(matches16) |
84 | |
85 | movdqa 32(%rdi), %xmm3 |
86 | pcmpeqb %xmm1, %xmm3 |
87 | pmovmskb %xmm3, %eax |
88 | test %eax, %eax |
89 | jnz L(matches32) |
90 | |
/* For the +48 chunk %rdi is advanced by 64 before the branch, which is
   why L(matches0) uses a displacement of -16 (48 - 64).  */
91 | movdqa 48(%rdi), %xmm4 |
92 | pcmpeqb %xmm1, %xmm4 |
93 | add $64, %rdi |
94 | pmovmskb %xmm4, %eax |
95 | test %eax, %eax |
96 | jnz L(matches0) |
97 | |
/* If %rdi is now 64-byte aligned, enter the main loop directly.  */
98 | test $0x3f, %rdi |
99 | jz L(align64_loop) |
100 | |
/* Otherwise check four more chunks in the same way before aligning.  */
101 | movdqa (%rdi), %xmm0 |
102 | pcmpeqb %xmm1, %xmm0 |
103 | pmovmskb %xmm0, %eax |
104 | test %eax, %eax |
105 | jnz L(matches) |
106 | |
107 | movdqa 16(%rdi), %xmm2 |
108 | pcmpeqb %xmm1, %xmm2 |
109 | pmovmskb %xmm2, %eax |
110 | test %eax, %eax |
111 | jnz L(matches16) |
112 | |
113 | movdqa 32(%rdi), %xmm3 |
114 | pcmpeqb %xmm1, %xmm3 |
115 | pmovmskb %xmm3, %eax |
116 | test %eax, %eax |
117 | jnz L(matches32) |
118 | |
119 | movdqa 48(%rdi), %xmm3 |
120 | pcmpeqb %xmm1, %xmm3 |
121 | pmovmskb %xmm3, %eax |
122 | |
123 | add $64, %rdi |
124 | test %eax, %eax |
125 | jnz L(matches0) |
126 | |
/* Round %rdi down to a 64-byte boundary for the aligned loop.  This
   steps back by at most 48 bytes, all of which belong to the four
   chunks just checked, so re-scanning them cannot change the result.  */
127 | and $-64, %rdi |
128 | |
129 | .p2align 4 |
130 | L(align64_loop): |
/* Main loop: compare 64 bytes per iteration.  %rdi is 64-byte aligned
   on entry to every iteration.  */
131 | movdqa (%rdi), %xmm0 |
132 | movdqa 16(%rdi), %xmm2 |
133 | movdqa 32(%rdi), %xmm3 |
134 | movdqa 48(%rdi), %xmm4 |
135 | |
136 | pcmpeqb %xmm1, %xmm0 |
137 | pcmpeqb %xmm1, %xmm2 |
138 | pcmpeqb %xmm1, %xmm3 |
139 | pcmpeqb %xmm1, %xmm4 |
140 | |
/* Each compare result byte is 0x00 or 0xff, so the unsigned byte
   maximum acts as a bytewise OR and one pmovmskb/test covers all four
   chunks.  The merge overwrites %xmm3 and %xmm4; only %xmm0 and %xmm2
   still hold per-chunk results afterwards.  */
141 | pmaxub %xmm0, %xmm3 |
142 | pmaxub %xmm2, %xmm4 |
143 | pmaxub %xmm3, %xmm4 |
144 | pmovmskb %xmm4, %eax |
145 | |
146 | add $64, %rdi |
147 | |
148 | test %eax, %eax |
149 | jz L(align64_loop) |
150 | |
/* A match lies somewhere in the 64 bytes just examined: undo the
   advance and work out which chunk holds the first one.  */
151 | sub $64, %rdi |
152 | |
153 | pmovmskb %xmm0, %eax |
154 | test %eax, %eax |
155 | jnz L(matches) |
156 | |
157 | pmovmskb %xmm2, %eax |
158 | test %eax, %eax |
159 | jnz L(matches16) |
160 | |
/* The per-chunk results for +32 and +48 were destroyed by pmaxub, so
   recompute them.  The +48 compare is done in place into %xmm1; the
   broadcast byte is not needed again because every remaining path
   returns.  */
161 | movdqa 32(%rdi), %xmm3 |
162 | pcmpeqb %xmm1, %xmm3 |
163 | |
164 | pcmpeqb 48(%rdi), %xmm1 |
165 | pmovmskb %xmm3, %eax |
166 | test %eax, %eax |
167 | jnz L(matches32) |
168 | |
/* The first three chunks had no match, so it must be in chunk +48.  */
169 | pmovmskb %xmm1, %eax |
170 | bsf %eax, %eax |
171 | lea 48(%rdi, %rax), %rax |
172 | ret |
173 | |
/* Exit stubs.  On entry %eax holds the non-zero match mask of one
   16-byte chunk; bsf turns it into the index of the first matching
   byte, which is then added to that chunk's address.  */
174 | .p2align 4 |
175 | L(matches0): |
/* Match in a +48 chunk, reached after %rdi was already advanced by 64.  */
176 | bsf %eax, %eax |
177 | lea -16(%rax, %rdi), %rax |
178 | ret |
179 | |
180 | .p2align 4 |
181 | L(matches): |
/* Match in the chunk at %rdi itself.  */
182 | bsf %eax, %eax |
183 | add %rdi, %rax |
184 | ret |
185 | |
186 | .p2align 4 |
187 | L(matches16): |
/* Match in the chunk at %rdi + 16.  */
188 | bsf %eax, %eax |
189 | lea 16(%rax, %rdi), %rax |
190 | ret |
191 | |
192 | .p2align 4 |
193 | L(matches32): |
/* Match in the chunk at %rdi + 32.  */
194 | bsf %eax, %eax |
195 | lea 32(%rax, %rdi), %rax |
196 | ret |
197 | |
198 | END (__rawmemchr) |
199 | |
/* NOTE(review): both macros below are defined outside this file.
   weak_alias presumably makes rawmemchr a weak alias for __rawmemchr,
   so the routine is reachable under the public name -- confirm.  */
200 | weak_alias (__rawmemchr, rawmemchr) |
/* Presumably provides the hidden definition of __rawmemchr used for
   calls from inside the library -- confirm against the macro.  */
201 | libc_hidden_builtin_def (__rawmemchr) |
202 | |