/* memchr/wmemchr optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR	__memchr_avx2
# endif

# ifdef USE_AS_WMEMCHR
#  define VPCMPEQ	vpcmpeqd
#  define VPBROADCAST	vpbroadcastd
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPBROADCAST	vpbroadcastb
#  define CHAR_SIZE	1
# endif

# ifdef USE_AS_RAWMEMCHR
#  define ERAW_PTR_REG	ecx
#  define RRAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define ERAW_PTR_REG	edi
#  define RRAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
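
/* Overview: memchr (s, c, n) returns a pointer to the first byte among the
   first N bytes of S that equals (unsigned char) C, or NULL if there is
   none.  The same body is reused for wmemchr (4-byte chars) and rawmemchr
   (no length) via the macros above.  As a rough reference model only
   (memchr_ref is an illustrative name, not a glibc symbol):

	#include <stddef.h>

	static void *
	memchr_ref (const void *s, int c, size_t n)
	{
	  const unsigned char *p = s;
	  for (size_t i = 0; i < n; i++)
	    if (p[i] == (unsigned char) c)
	      return (void *) (p + i);
	  return NULL;
	}

   The strategy below: broadcast C into %ymm0, check the first VEC_SIZE
   bytes (taking an aligned fallback if that load could cross a page),
   then check 4 more vectors one at a time, then loop over aligned
   4 * VEC_SIZE blocks until a match is found or the length runs out.  */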

	.section SECTION(.text),"ax",@progbits
ENTRY_P2ALIGN (MEMCHR, 5)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
#  else
	test	%RDX_LP, %RDX_LP
#  endif
	jz	L(null)
# endif
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	VPBROADCAST	%xmm0, %ymm0
	/* Check if we may cross page boundary with one vector load.  */
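	/* A VEC_SIZE-byte load from %rdi is safe only if the page offset of
	   %rdi is at most PAGE_SIZE - VEC_SIZE; otherwise the unaligned load
	   below could touch the next (possibly unmapped) page, so take the
	   aligned cross-page path instead.  */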
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length < CHAR_PER_VEC, handle specially.  */
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	bsfl	%eax, %eax
	addq	%rdi, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN


# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(first_vec_x0):
	/* Check if first match was before length.  */
	tzcntl	%eax, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	COND_VZEROUPPER
	/* Use a branch instead of cmovcc here so that L(first_vec_x0) fits
	   in one fetch block.  The branch is not costly: the common use of
	   memchr is to check whether the return value is NULL (if the string
	   were known to contain CHAR, the user would call rawmemchr), so this
	   branch is highly correlated with the user's own branch and most
	   modern branch predictors can use it to predict the user branch.  */
	cmpl	%eax, %edx
	jle	L(null)
	addq	%rdi, %rax
	ret
# endif

	.p2align 4,, 10
L(first_vec_x1):
	bsfl	%eax, %eax
	incq	%rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR
	/* This fits in the alignment padding bytes here.  */
L(null):
	xorl	%eax, %eax
	ret
# endif
	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(aligned_more):
	/* Check the first 4 * VEC_SIZE bytes.  Only one VEC can be checked
	   at a time since the data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
L(cross_page_continue):
	/* Align data to VEC_SIZE - 1.  */
	xorl	%ecx, %ecx
	subl	%edi, %ecx
	orq	$(VEC_SIZE - 1), %rdi
	/* esi is for adjusting length to see if near the end.  */
	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %esi
#  endif
# else
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
# endif
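	/* %rdi now points at the last byte of the VEC_SIZE-aligned block
	   containing the start, so the loads below at 1(%rdi),
	   (VEC_SIZE + 1)(%rdi), ... are all VEC_SIZE-aligned.  For memchr,
	   %esi holds the distance (in chars) from the original pointer to
	   the end of the 4th of these vectors and is used to adjust the
	   length.  */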
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  If near the end, handle specially.  */
	subq	%rsi, %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

# ifndef USE_AS_RAWMEMCHR
	/* Check if at most CHAR_PER_VEC * 4 chars remain.  */
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec_or_less_cmpeq)
	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
	   length.  */
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	addq	%rcx, %rdx
# else
	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5
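	/* OR the four compare results together so that a single vpmovmskb
	   and test below detect a match anywhere in the 4 vectors.  */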

	vpmovmskb	%ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
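	/* NB: -(VEC_SIZE * 4) fits in a sign-extended imm8 while
	   VEC_SIZE * 4 does not, so subtracting the negative constant
	   encodes smaller than the equivalent addq.  */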
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)
# else
	testl	%ecx, %ecx
	jnz	L(loop_4x_vec_end)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_4x_vec)

	/* Fall through into the case where fewer than 4 vectors of
	   length remain.  */
	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	.p2align 4
L(last_4x_vec_or_less):
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	/* Check if first VEC contained match.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length > VEC_SIZE * 2.  */
	addl	$(VEC_SIZE * 2), %edx
	jg	L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length < VEC_SIZE.  */
	addl	$VEC_SIZE, %edx
	jle	L(zero_end)

	/* Check VEC2 and compare any match with remaining length.  */
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	addq	$(VEC_SIZE + 1), %rdi
	addq	%rdi, %rax
L(zero_end):
	VZEROUPPER_RETURN

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr will fall through into this if a match was found in
	   the loop.  */

	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_return)

	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

	vpmovmskb	%ymm3, %eax
	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
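	/* %rax is now the offset of the first match from the start of VEC3:
	   %ecx held the byte-wise OR of the VEC3 and VEC4 compare masks
	   (VEC1/VEC2 are already known to be empty here), so after the
	   shift/or the low 32 bits cover VEC3 and the high 32 bits cover
	   VEC4.  */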
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 2 - 1), %rdi
# else
	subq	$-(VEC_SIZE * 2 + 1), %rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR

	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Adjust length.  */
	subl	$-(VEC_SIZE * 4), %edx
	/* Check if match within remaining length.  */
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	incq	%rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
	.p2align 4,, 6
L(set_zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_x1_return):
	tzcntl	%eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 4 - 1), %rdi
# else
	incq	%rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2_return):
	tzcntl	%eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 3 - 1), %rdi
# else
	subq	$-(VEC_SIZE + 1), %rdi
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %edx
#  endif
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check first VEC regardless.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length <= CHAR_PER_VEC * 2.  */
	addl	$(VEC_SIZE * 2), %edx
	jle	L(last_2x_vec)
	.p2align 4
L(last_4x_vec):
	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax

	/* Create mask for possible matches within remaining length.  */
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
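	/* bzhi leaves only the low %rdx bits of the all-ones value set,
	   i.e. %rcx = (1 << remaining_length) - 1.  */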

	/* Keep only the matches that fall within the remaining length.  */
	andl	%ecx, %eax
	jnz	L(last_vec_x3)

	/* If remaining length <= VEC_SIZE * 3 (note this is after remaining
	   length was already found to be > VEC_SIZE * 2).  */
	subl	$VEC_SIZE, %edx
	jbe	L(zero_end2)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Shift remaining length mask for last VEC.  */
	shrq	$32, %rcx
	andl	%ecx, %eax
	jz	L(zero_end2)
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
	addq	%rdi, %rax
L(zero_end2):
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subq	$-(VEC_SIZE * 2 + 1), %rdi
	addq	%rdi, %rax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(cross_page_boundary):
	/* Save the pointer before aligning, as its original value is needed
	   to compute the return address if a byte is found, or to adjust the
	   length if it is not and this is memchr.  */
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE - 1.  ALGN_PTR_REG is rcx for memchr
	   and rdi for rawmemchr.  */
	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Calculate length until end of page (length checked for a match).  */
	leaq	1(%ALGN_PTR_REG), %rsi
	subq	%RRAW_PTR_REG, %rsi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %esi
#  endif
# endif
	/* Remove the leading bytes.  */
	sarxl	%ERAW_PTR_REG, %eax, %eax
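	/* sarx shifts the match bitmap right by the low bits (the
	   within-vector misalignment) of the original pointer so that bit 0
	   corresponds to the first requested byte.  */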
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rsi, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(cross_page_continue)
	bsfl	%eax, %eax
	addq	%RRAW_PTR_REG, %rax
	VZEROUPPER_RETURN


END (MEMCHR)
#endif