1/* strchr/strchrnul optimized with AVX2.
2 Copyright (C) 2017-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
23# ifndef STRCHR
24# define STRCHR __strchr_avx2
25# endif
26
27# ifdef USE_AS_WCSCHR
28# define VPBROADCAST vpbroadcastd
29# define VPCMPEQ vpcmpeqd
30# define VPMINU vpminud
31# define CHAR_REG esi
32# else
33# define VPBROADCAST vpbroadcastb
34# define VPCMPEQ vpcmpeqb
35# define VPMINU vpminub
36# define CHAR_REG sil
37# endif
38
39# ifndef VZEROUPPER
40# define VZEROUPPER vzeroupper
41# endif
42
43# ifndef SECTION
44# define SECTION(p) p##.avx
45# endif
46
47# define VEC_SIZE 32
48# define PAGE_SIZE 4096
49
50 .section SECTION(.text),"ax",@progbits
51ENTRY_P2ALIGN (STRCHR, 5)
52 /* Broadcast CHAR to YMM0. */
53 vmovd %esi, %xmm0
54 movl %edi, %eax
55 andl $(PAGE_SIZE - 1), %eax
56 VPBROADCAST %xmm0, %ymm0
57 vpxor %xmm1, %xmm1, %xmm1
58
59 /* Check if we cross page boundary with one vector load. */
60 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
61 ja L(cross_page_boundary)
62
63 /* Check the first VEC_SIZE bytes. Search for both CHAR and the
64 null byte. */
65 vmovdqu (%rdi), %ymm2
66 VPCMPEQ %ymm2, %ymm0, %ymm3
67 VPCMPEQ %ymm2, %ymm1, %ymm2
68 vpor %ymm3, %ymm2, %ymm3
69 vpmovmskb %ymm3, %eax
70 testl %eax, %eax
71 jz L(aligned_more)
72 tzcntl %eax, %eax
73# ifndef USE_AS_STRCHRNUL
74 /* Found CHAR or the null byte. */
75 cmp (%rdi, %rax), %CHAR_REG
76 /* NB: Use a branch instead of cmovcc here. The expectation is
77 that with strchr the user will branch based on input being
78 null. Since this branch will be 100% predictive of the user
79 branch a branch miss here should save what otherwise would
80 be branch miss in the user code. Otherwise using a branch 1)
81 saves code size and 2) is faster in highly predictable
82 environments. */
83 jne L(zero)
84# endif
85 addq %rdi, %rax
86L(return_vzeroupper):
87 ZERO_UPPER_VEC_REGISTERS_RETURN
88
89# ifndef USE_AS_STRCHRNUL
90L(zero):
91 xorl %eax, %eax
92 VZEROUPPER_RETURN
93# endif
94
95
96 .p2align 4
97L(first_vec_x1):
98 /* Use bsf to save code size. */
99 bsfl %eax, %eax
100 incq %rdi
101# ifndef USE_AS_STRCHRNUL
102 /* Found CHAR or the null byte. */
103 cmp (%rdi, %rax), %CHAR_REG
104 jne L(zero)
105# endif
106 addq %rdi, %rax
107 VZEROUPPER_RETURN
108
109 .p2align 4,, 10
110L(first_vec_x2):
111 /* Use bsf to save code size. */
112 bsfl %eax, %eax
113 addq $(VEC_SIZE + 1), %rdi
114# ifndef USE_AS_STRCHRNUL
115 /* Found CHAR or the null byte. */
116 cmp (%rdi, %rax), %CHAR_REG
117 jne L(zero)
118# endif
119 addq %rdi, %rax
120 VZEROUPPER_RETURN
121
122 .p2align 4,, 8
123L(first_vec_x3):
124 /* Use bsf to save code size. */
125 bsfl %eax, %eax
126 addq $(VEC_SIZE * 2 + 1), %rdi
127# ifndef USE_AS_STRCHRNUL
128 /* Found CHAR or the null byte. */
129 cmp (%rdi, %rax), %CHAR_REG
130 jne L(zero)
131# endif
132 addq %rdi, %rax
133 VZEROUPPER_RETURN
134
135 .p2align 4,, 10
136L(first_vec_x4):
137 /* Use bsf to save code size. */
138 bsfl %eax, %eax
139 addq $(VEC_SIZE * 3 + 1), %rdi
140# ifndef USE_AS_STRCHRNUL
141 /* Found CHAR or the null byte. */
142 cmp (%rdi, %rax), %CHAR_REG
143 jne L(zero)
144# endif
145 addq %rdi, %rax
146 VZEROUPPER_RETURN
147
148
149
150 .p2align 4
151L(aligned_more):
152 /* Align data to VEC_SIZE - 1. This is the same number of
153 instructions as using andq -VEC_SIZE but saves 4 bytes of code
154 on x4 check. */
155 orq $(VEC_SIZE - 1), %rdi
156L(cross_page_continue):
157 /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
158 since data is only aligned to VEC_SIZE. */
159 vmovdqa 1(%rdi), %ymm2
160 VPCMPEQ %ymm2, %ymm0, %ymm3
161 VPCMPEQ %ymm2, %ymm1, %ymm2
162 vpor %ymm3, %ymm2, %ymm3
163 vpmovmskb %ymm3, %eax
164 testl %eax, %eax
165 jnz L(first_vec_x1)
166
167 vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2
168 VPCMPEQ %ymm2, %ymm0, %ymm3
169 VPCMPEQ %ymm2, %ymm1, %ymm2
170 vpor %ymm3, %ymm2, %ymm3
171 vpmovmskb %ymm3, %eax
172 testl %eax, %eax
173 jnz L(first_vec_x2)
174
175 vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2
176 VPCMPEQ %ymm2, %ymm0, %ymm3
177 VPCMPEQ %ymm2, %ymm1, %ymm2
178 vpor %ymm3, %ymm2, %ymm3
179 vpmovmskb %ymm3, %eax
180 testl %eax, %eax
181 jnz L(first_vec_x3)
182
183 vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2
184 VPCMPEQ %ymm2, %ymm0, %ymm3
185 VPCMPEQ %ymm2, %ymm1, %ymm2
186 vpor %ymm3, %ymm2, %ymm3
187 vpmovmskb %ymm3, %eax
188 testl %eax, %eax
189 jnz L(first_vec_x4)
190 /* Align data to VEC_SIZE * 4 - 1. */
191 incq %rdi
192 orq $(VEC_SIZE * 4 - 1), %rdi
193 .p2align 4
194L(loop_4x_vec):
195 /* Compare 4 * VEC at a time forward. */
196 vmovdqa 1(%rdi), %ymm6
197 vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7
198
199 /* Leaves only CHARS matching esi as 0. */
200 vpxor %ymm6, %ymm0, %ymm2
201 vpxor %ymm7, %ymm0, %ymm3
202
203 VPMINU %ymm2, %ymm6, %ymm2
204 VPMINU %ymm3, %ymm7, %ymm3
205
206 vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6
207 vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7
208
209 vpxor %ymm6, %ymm0, %ymm4
210 vpxor %ymm7, %ymm0, %ymm5
211
212 VPMINU %ymm4, %ymm6, %ymm4
213 VPMINU %ymm5, %ymm7, %ymm5
214
215 VPMINU %ymm2, %ymm3, %ymm6
216 VPMINU %ymm4, %ymm5, %ymm7
217
218 VPMINU %ymm6, %ymm7, %ymm7
219
220 VPCMPEQ %ymm7, %ymm1, %ymm7
221 vpmovmskb %ymm7, %ecx
222 subq $-(VEC_SIZE * 4), %rdi
223 testl %ecx, %ecx
224 jz L(loop_4x_vec)
225
226 VPCMPEQ %ymm2, %ymm1, %ymm2
227 vpmovmskb %ymm2, %eax
228 testl %eax, %eax
229 jnz L(last_vec_x0)
230
231
232 VPCMPEQ %ymm3, %ymm1, %ymm3
233 vpmovmskb %ymm3, %eax
234 testl %eax, %eax
235 jnz L(last_vec_x1)
236
237 VPCMPEQ %ymm4, %ymm1, %ymm4
238 vpmovmskb %ymm4, %eax
239 /* rcx has combined result from all 4 VEC. It will only be used
240 if the first 3 other VEC all did not contain a match. */
241 salq $32, %rcx
242 orq %rcx, %rax
243 tzcntq %rax, %rax
244 subq $(VEC_SIZE * 2 - 1), %rdi
245# ifndef USE_AS_STRCHRNUL
246 /* Found CHAR or the null byte. */
247 cmp (%rdi, %rax), %CHAR_REG
248 jne L(zero_end)
249# endif
250 addq %rdi, %rax
251 VZEROUPPER_RETURN
252
253
254 .p2align 4,, 10
255L(last_vec_x0):
256 /* Use bsf to save code size. */
257 bsfl %eax, %eax
258 addq $-(VEC_SIZE * 4 - 1), %rdi
259# ifndef USE_AS_STRCHRNUL
260 /* Found CHAR or the null byte. */
261 cmp (%rdi, %rax), %CHAR_REG
262 jne L(zero_end)
263# endif
264 addq %rdi, %rax
265 VZEROUPPER_RETURN
266
267
268 .p2align 4,, 10
269L(last_vec_x1):
270 tzcntl %eax, %eax
271 subq $(VEC_SIZE * 3 - 1), %rdi
272# ifndef USE_AS_STRCHRNUL
273 /* Found CHAR or the null byte. */
274 cmp (%rdi, %rax), %CHAR_REG
275 jne L(zero_end)
276# endif
277 addq %rdi, %rax
278 VZEROUPPER_RETURN
279
280# ifndef USE_AS_STRCHRNUL
281L(zero_end):
282 xorl %eax, %eax
283 VZEROUPPER_RETURN
284# endif
285
286 /* Cold case for crossing page with first load. */
287 .p2align 4,, 8
288L(cross_page_boundary):
289 movq %rdi, %rdx
290 /* Align rdi to VEC_SIZE - 1. */
291 orq $(VEC_SIZE - 1), %rdi
292 vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2
293 VPCMPEQ %ymm2, %ymm0, %ymm3
294 VPCMPEQ %ymm2, %ymm1, %ymm2
295 vpor %ymm3, %ymm2, %ymm3
296 vpmovmskb %ymm3, %eax
297 /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
298 so no need to manually mod edx. */
299 sarxl %edx, %eax, %eax
300 testl %eax, %eax
301 jz L(cross_page_continue)
302 tzcntl %eax, %eax
303# ifndef USE_AS_STRCHRNUL
304 xorl %ecx, %ecx
305 /* Found CHAR or the null byte. */
306 cmp (%rdx, %rax), %CHAR_REG
307 jne L(zero_end)
308# endif
309 addq %rdx, %rax
310 VZEROUPPER_RETURN
311
312END (STRCHR)
313#endif
314

source code of glibc/sysdeps/x86_64/multiarch/strchr-avx2.S