/* memchr/wmemchr optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR	__memchr_avx2
# endif

# ifdef USE_AS_WMEMCHR
#  define VPCMPEQ	vpcmpeqd
#  define VPBROADCAST	vpbroadcastd
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPBROADCAST	vpbroadcastb
#  define CHAR_SIZE	1
# endif

# ifdef USE_AS_RAWMEMCHR
#  define ERAW_PTR_REG	ecx
#  define RRAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define ERAW_PTR_REG	edi
#  define RRAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

        .section SECTION(.text),"ax",@progbits
ENTRY_P2ALIGN (MEMCHR, 5)
# ifndef USE_AS_RAWMEMCHR
        /* Check for zero length.  */
#  ifdef __ILP32__
        /* Clear the upper 32 bits.  NB: the and also sets ZF for the
           zero-length check below.  */
        and     %RDX_LP, %RDX_LP
#  else
        test    %RDX_LP, %RDX_LP
#  endif
        jz      L(null)
# endif
        /* Broadcast CHAR to ymm0.  */
        vmovd   %esi, %xmm0
        VPBROADCAST %xmm0, %ymm0
        /* Check if we may cross a page boundary with one vector load.  */
        movl    %edi, %eax
        andl    $(PAGE_SIZE - 1), %eax
        cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
        ja      L(cross_page_boundary)

        /* Check the first VEC_SIZE bytes.  */
        VPCMPEQ (%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
        /* If length <= CHAR_PER_VEC handle specially.  */
        cmpq    $CHAR_PER_VEC, %rdx
        jbe     L(first_vec_x0)
# endif
        testl   %eax, %eax
        jz      L(aligned_more)
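        /* Each bit of eax corresponds to one byte of the vector; bsf
           yields the byte offset of the first match.  */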
        bsfl    %eax, %eax
        addq    %rdi, %rax
L(return_vzeroupper):
        ZERO_UPPER_VEC_REGISTERS_RETURN


# ifndef USE_AS_RAWMEMCHR
        .p2align 4
L(first_vec_x0):
        /* Check whether the first match falls within the length.  */
        tzcntl  %eax, %eax
#  ifdef USE_AS_WMEMCHR
        /* NB: Multiply length by 4 to get byte count.  */
        sall    $2, %edx
#  endif
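        /* NB: if there was no match, tzcnt set eax to 32 (VEC_SIZE),
           which is >= any length that reaches here, so the compare
           below falls through to L(null).  */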
        COND_VZEROUPPER
        /* Use a branch instead of cmovcc so that L(first_vec_x0) fits
           in one fetch block.  A branch here, as opposed to cmovcc, is
           not that costly: common usage of memchr is to check whether
           the return value is NULL (if the string were known to contain
           CHAR the user would use rawmemchr), so this branch is highly
           correlated with the user's branch and most modern branch
           predictors can use it to predict the user's branch.  */
        cmpl    %eax, %edx
        jle     L(null)
        addq    %rdi, %rax
        ret
# endif

        .p2align 4,, 10
L(first_vec_x1):
        bsfl    %eax, %eax
        incq    %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR
        /* Fits in the alignment padding bytes here.  */
L(null):
        xorl    %eax, %eax
        ret
# endif
        .p2align 4
L(first_vec_x2):
        tzcntl  %eax, %eax
        addq    $(VEC_SIZE + 1), %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN

        .p2align 4
L(first_vec_x3):
        tzcntl  %eax, %eax
        addq    $(VEC_SIZE * 2 + 1), %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN


        .p2align 4
L(first_vec_x4):
        tzcntl  %eax, %eax
        addq    $(VEC_SIZE * 3 + 1), %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN

        .p2align 4
L(aligned_more):
        /* Check the first 4 * VEC_SIZE bytes.  Only one VEC_SIZE at a
           time since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
L(cross_page_continue):
        /* Align data to VEC_SIZE - 1 (point rdi at the last byte of
           its vector) so that loads at 1(%rdi) are VEC_SIZE aligned.  */
        xorl    %ecx, %ecx
        subl    %edi, %ecx
        orq     $(VEC_SIZE - 1), %rdi
        /* esi will hold the number of CHARs checked once the four
           vector checks below are done; used to see if we are near
           the end.  */
        leal    (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
        /* NB: Divide bytes by 4 to get the wchar_t count.  */
        sarl    $2, %esi
#  endif
# else
        orq     $(VEC_SIZE - 1), %rdi
L(cross_page_continue):
# endif
        /* Load first VEC regardless.  */
        VPCMPEQ 1(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
        /* Adjust the length; if near the end, handle it specially.  */
        subq    %rsi, %rdx
        jbe     L(last_4x_vec_or_less)
# endif
        testl   %eax, %eax
        jnz     L(first_vec_x1)

        VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        testl   %eax, %eax
        jnz     L(first_vec_x2)

        VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        testl   %eax, %eax
        jnz     L(first_vec_x3)

        VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        testl   %eax, %eax
        jnz     L(first_vec_x4)

# ifndef USE_AS_RAWMEMCHR
        /* Check if at most CHAR_PER_VEC * 4 CHARs of length remain.  */
        subq    $(CHAR_PER_VEC * 4), %rdx
        jbe     L(last_4x_vec_or_less_cmpeq)
        /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
           length.  */
        incq    %rdi
        movl    %edi, %ecx
        orq     $(VEC_SIZE * 4 - 1), %rdi
        andl    $(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WMEMCHR
        /* NB: Divide bytes by 4 to get the wchar_t count.  */
        sarl    $2, %ecx
#  endif
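        /* The first loop iteration will rescan the CHARs between the
           old and the newly aligned rdi (ecx of them), so add them
           back to the length.  */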
        addq    %rcx, %rdx
# else
        /* Align data to VEC_SIZE * 4 - 1 for the loop.  */
        incq    %rdi
        orq     $(VEC_SIZE * 4 - 1), %rdi
# endif

        /* Compare 4 * VEC at a time forward.  */
        .p2align 4
L(loop_4x_vec):
        VPCMPEQ 1(%rdi), %ymm0, %ymm1
        VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
        VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
        VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
        vpor    %ymm1, %ymm2, %ymm5
        vpor    %ymm3, %ymm4, %ymm6
        vpor    %ymm5, %ymm6, %ymm5

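        /* The four compare results were OR'd together above, so a
           single vpmovmskb and test detects a match in any of the four
           vectors.  */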
        vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
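        /* NB: subq of -(VEC_SIZE * 4) rather than addq of VEC_SIZE * 4:
           -128 fits in a sign-extended imm8 while +128 would need an
           imm32, saving code size.  */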
        subq    $-(VEC_SIZE * 4), %rdi
        testl   %ecx, %ecx
        jz      L(loop_4x_vec)
# else
        testl   %ecx, %ecx
        jnz     L(loop_4x_vec_end)

        subq    $-(VEC_SIZE * 4), %rdi

        subq    $(CHAR_PER_VEC * 4), %rdx
        ja      L(loop_4x_vec)

        /* Fall through into the case of fewer than 4 vectors of
           length remaining.  */
        VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        .p2align 4
L(last_4x_vec_or_less):
#  ifdef USE_AS_WMEMCHR
        /* NB: Multiply length by 4 to get byte count.  */
        sall    $2, %edx
#  endif
        /* Check if first VEC contained match.  */
        testl   %eax, %eax
        jnz     L(first_vec_x1_check)

        /* If remaining length > VEC_SIZE * 2.  */
        addl    $(VEC_SIZE * 2), %edx
        jg      L(last_4x_vec)

L(last_2x_vec):
        /* If remaining length <= VEC_SIZE.  */
        addl    $VEC_SIZE, %edx
        jle     L(zero_end)

        /* Check VEC2 and compare any match with remaining length.  */
        VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        tzcntl  %eax, %eax
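        /* NB: tzcnt yields 32 when eax is zero, which always fails the
           bounds check below (edx is at most VEC_SIZE here).  */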
        cmpl    %eax, %edx
        jbe     L(set_zero_end)
        addq    $(VEC_SIZE + 1), %rdi
        addq    %rdi, %rax
L(zero_end):
        VZEROUPPER_RETURN

        .p2align 4
L(loop_4x_vec_end):
# endif
        /* rawmemchr will fall through into this if a match was found
           in the loop.  */

        vpmovmskb %ymm1, %eax
        testl   %eax, %eax
        jnz     L(last_vec_x1_return)

        vpmovmskb %ymm2, %eax
        testl   %eax, %eax
        jnz     L(last_vec_x2_return)

        vpmovmskb %ymm3, %eax
        /* Combine VEC3 matches (eax) with VEC4 matches (ecx).  NB: ecx
           holds the mask of all four VECs OR'd together, but VEC1 and
           VEC2 have already been tested to be zero, so its set bits can
           only come from VEC3 or VEC4.  tzcnt on the combined 64-bit
           mask therefore finds the first match in VEC3, or in VEC4 if
           VEC3 has none.  */
        salq    $32, %rcx
        orq     %rcx, %rax
        tzcntq  %rax, %rax
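        /* Set rdi to the start of VEC3 so rdi + rax is the return
           pointer: for rawmemchr rdi has already been advanced by
           VEC_SIZE * 4 past this iteration, for memchr it has not.  */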
# ifdef USE_AS_RAWMEMCHR
        subq    $(VEC_SIZE * 2 - 1), %rdi
# else
        subq    $-(VEC_SIZE * 2 + 1), %rdi
# endif
        addq    %rdi, %rax
        VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR

        .p2align 4
L(first_vec_x1_check):
        tzcntl  %eax, %eax
        /* Adjust length.  */
        subl    $-(VEC_SIZE * 4), %edx
        /* Check if match within remaining length.  */
        cmpl    %eax, %edx
        jbe     L(set_zero_end)
        incq    %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN
        .p2align 4,, 6
L(set_zero_end):
        xorl    %eax, %eax
        VZEROUPPER_RETURN
# endif

        .p2align 4
L(last_vec_x1_return):
        tzcntl  %eax, %eax
# ifdef USE_AS_RAWMEMCHR
        subq    $(VEC_SIZE * 4 - 1), %rdi
# else
        incq    %rdi
# endif
        addq    %rdi, %rax
        VZEROUPPER_RETURN

        .p2align 4
L(last_vec_x2_return):
        tzcntl  %eax, %eax
# ifdef USE_AS_RAWMEMCHR
        subq    $(VEC_SIZE * 3 - 1), %rdi
# else
        subq    $-(VEC_SIZE + 1), %rdi
# endif
        addq    %rdi, %rax
        VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
        .p2align 4
L(last_4x_vec_or_less_cmpeq):
        VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
#  ifdef USE_AS_WMEMCHR
        /* NB: Multiply length by 4 to get byte count.  */
        sall    $2, %edx
#  endif
        subq    $-(VEC_SIZE * 4), %rdi
        /* Check first VEC regardless.  */
        testl   %eax, %eax
        jnz     L(first_vec_x1_check)

        /* If remaining length <= VEC_SIZE * 2 (edx is a byte count at
           this point).  */
        addl    $(VEC_SIZE * 2), %edx
        jle     L(last_2x_vec)
        .p2align 4
L(last_4x_vec):
        VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        testl   %eax, %eax
        jnz     L(last_vec_x2_return)

        VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax

        /* Create mask for possible matches within remaining length.  */
        movq    $-1, %rcx
        bzhiq   %rdx, %rcx, %rcx
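        /* NB: rdx is the number of valid positions left across VEC3
           and VEC4 combined; a bzhi index >= 64 clears no bits, leaving
           rcx all-ones, which is correct since every position is then
           in bounds.  */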

        /* Keep only the matches that fall within the remaining
           length.  */
        andl    %ecx, %eax
        jnz     L(last_vec_x3)

        /* If remaining length <= VEC_SIZE * 3 (note this is after
           remaining length was found to be > VEC_SIZE * 2).  */
        subl    $VEC_SIZE, %edx
        jbe     L(zero_end2)

        VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
        /* Shift remaining length mask for last VEC.  */
        shrq    $32, %rcx
        andl    %ecx, %eax
        jz      L(zero_end2)
        tzcntl  %eax, %eax
        addq    $(VEC_SIZE * 3 + 1), %rdi
        addq    %rdi, %rax
L(zero_end2):
        VZEROUPPER_RETURN

        .p2align 4
L(last_vec_x3):
        tzcntl  %eax, %eax
        subq    $-(VEC_SIZE * 2 + 1), %rdi
        addq    %rdi, %rax
        VZEROUPPER_RETURN
# endif

        .p2align 4
L(cross_page_boundary):
        /* Save the pointer before aligning, as its original value is
           needed to compute the return address if a byte is found, or
           to adjust the length if one is not and this is memchr.  */
        movq    %rdi, %rcx
        /* Align data to VEC_SIZE - 1.  ALGN_PTR_REG is rcx for memchr
           and rdi for rawmemchr.  */
        orq     $(VEC_SIZE - 1), %ALGN_PTR_REG
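        /* The aligned load below cannot cross the page boundary and
           contains the original pointer somewhere within it; matches
           before that pointer are shifted out afterwards.  */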
        VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
        vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
        /* Calculate the number of CHARs from the original pointer to
           the end of the aligned vector (the length checked for a
           match here).  */
        leaq    1(%ALGN_PTR_REG), %rsi
        subq    %RRAW_PTR_REG, %rsi
#  ifdef USE_AS_WMEMCHR
        /* NB: Divide bytes by 4 to get wchar_t count.  */
        shrl    $2, %esi
#  endif
# endif
        /* Remove the leading bytes: shift out the compare bits for
           bytes before the original pointer (its low bits give the
           offset within the aligned vector).  */
        sarxl   %ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
        /* Check the end of data.  */
        cmpq    %rsi, %rdx
        jbe     L(first_vec_x0)
# endif
        testl   %eax, %eax
        jz      L(cross_page_continue)
        bsfl    %eax, %eax
        addq    %RRAW_PTR_REG, %rax
        VZEROUPPER_RETURN


END (MEMCHR)
#endif