/* strrchr/wcsrchr optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR __strrchr_avx2
# endif

# ifdef USE_AS_WCSRCHR
#  define VPBROADCAST vpbroadcastd
#  define VPCMPEQ vpcmpeqd
#  define VPMIN vpminud
#  define CHAR_SIZE 4
# else
#  define VPBROADCAST vpbroadcastb
#  define VPCMPEQ vpcmpeqb
#  define VPMIN vpminub
#  define CHAR_SIZE 1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

	.section SECTION(.text), "ax", @progbits
ENTRY(STRRCHR)
	movd	%esi, %xmm7
	movl	%edi, %eax
	/* Broadcast CHAR to YMM7. */
	VPBROADCAST %xmm7, %ymm7
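	/* Zero ymm0 for the end-of-string (null CHAR) compares below; the
	   VEX xmm encoding also clears the upper lane. */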
	vpxor	%xmm0, %xmm0, %xmm0

	/* Shift here instead of `andl` to save code size (saves a fetch
	   block). */
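	/* The shift moves the page offset (rdi & (PAGE_SIZE - 1)) into the
	   high bits of eax, so the unsigned compare below is taken exactly
	   when the offset exceeds PAGE_SIZE - VEC_SIZE, i.e. when an
	   unaligned VEC_SIZE load from rdi would cross into the next
	   page. */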
	sall	$20, %eax
	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
	ja	L(cross_page)

L(page_cross_continue):
	vmovdqu	(%rdi), %ymm1
	/* Check end of string match. */
	VPCMPEQ	%ymm1, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl	%ecx, %ecx
	jz	L(aligned_more)

	/* Only check match with search CHAR if needed. */
	VPCMPEQ	%ymm1, %ymm7, %ymm1
	vpmovmskb %ymm1, %eax
	/* Check if match before first zero. */
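	/* blsmsk sets every bit up to and including the lowest set bit of
	   ecx (the first null byte), so the `and` below discards CHAR
	   matches that fall after the end of the string. */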
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jz	L(ret0)
	bsrl	%eax, %eax
	addq	%rdi, %rax
	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
	   search CHAR is zero we are correct. Either way `andq
	   -CHAR_SIZE, %rax` gets the correct result. */
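	/* (vpmovmskb is byte-granular, so a matching wide char sets
	   CHAR_SIZE bits and bsrl lands on its last byte; rounding down
	   recovers the first byte.) */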
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	/* Returns for first vec x1/x2 have hard-coded backward search
	   path for earlier matches. */
	.p2align 4,, 10
L(first_vec_x1):
	VPCMPEQ	%ymm2, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jnz	L(first_vec_x1_return)

	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMPEQ	%ymm1, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	testl	%eax, %eax
	jz	L(ret1)
	bsrl	%eax, %eax
	addq	%r8, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret1):
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x0_x1_test):
	VPCMPEQ	%ymm2, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	/* Check ymm2 for search CHAR match. If no match then check ymm1
	   before returning. */
	testl	%eax, %eax
	jz	L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsrl	%eax, %eax
	leaq	1(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(first_vec_x2):
	VPCMPEQ	%ymm3, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	blsmskl	%ecx, %ecx
	/* If there is no in-range search CHAR match in ymm3 then we need
	   to check ymm1/ymm2 for an earlier match (we delay checking
	   search CHAR matches until needed). */
	andl	%ecx, %eax
	jz	L(first_vec_x0_x1_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4
L(aligned_more):
	/* Save original pointer if match was in VEC 0. */
	movq	%rdi, %r8

	/* Align src. */
	orq	$(VEC_SIZE - 1), %rdi
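	/* rdi now points to the last byte of its VEC_SIZE block, so
	   1(%rdi) below is the next aligned vector. */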
	vmovdqu	1(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x1)

	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
	VPCMPEQ	%ymm3, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x2)

	/* Save pointer again before realigning. */
	movq	%rdi, %rsi
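	/* Round down to a (VEC_SIZE * 2) boundary for the unrolled loop.
	   This can rescan up to VEC_SIZE * 2 already-checked bytes, which
	   is harmless: they contain no null, so the last match before the
	   first null is unchanged. */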
	addq	$(VEC_SIZE + 1), %rdi
	andq	$-(VEC_SIZE * 2), %rdi
	.p2align 4
L(first_aligned_loop):
	/* Do 2x VEC at a time. Any more and the cost of finding the
	   match outweighs the loop benefit. */
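	/* An unsigned min is zero exactly when either source element is
	   zero, so comparing VPMIN of the two vectors against zero detects
	   end of string in both vectors with a single compare. */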
	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5

	VPCMPEQ	%ymm4, %ymm7, %ymm6
	VPMIN	%ymm4, %ymm5, %ymm8
	VPCMPEQ	%ymm5, %ymm7, %ymm10
	vpor	%ymm6, %ymm10, %ymm5
	VPCMPEQ	%ymm8, %ymm0, %ymm8
	vpor	%ymm5, %ymm8, %ymm9

	vpmovmskb %ymm9, %eax
	addq	$(VEC_SIZE * 2), %rdi
	/* No zero or search CHAR. */
	testl	%eax, %eax
	jz	L(first_aligned_loop)

	/* If no zero CHAR then go to second loop (this allows us to
	   throw away all prior work). */
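	/* (A later match always supersedes an earlier one, so once a CHAR
	   match is seen with no null yet, nothing before it can be the
	   result.) */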
	vpmovmskb %ymm8, %ecx
	testl	%ecx, %ecx
	jz	L(second_aligned_loop_prep)

	/* Search char could be zero so we need to get the true match.
	 */
	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(first_aligned_loop_return)

	.p2align 4,, 4
L(first_vec_x1_or_x2):
	VPCMPEQ	%ymm3, %ymm7, %ymm3
	VPCMPEQ	%ymm2, %ymm7, %ymm2
	vpmovmskb %ymm3, %eax
	vpmovmskb %ymm2, %edx
	/* Use add for macro-fusion. */
	addq	%rax, %rdx
	jz	L(first_vec_x0_test)
	/* NB: We could move this shift to before the branch and save a
	   bit of code size / performance on the fall through. The
	   branch leads to the null case which generally seems hotter
	   than char in first 3x VEC. */
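	/* Combine into one 64-bit mask: ymm2 matches in the low 32 bits,
	   ymm3 matches in the high 32 bits, so bsrq finds the last match
	   across both vectors and rsi + 1 + index addresses it for either
	   vector. */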
	salq	$32, %rax
	addq	%rdx, %rax
	bsrq	%rax, %rax
	leaq	1(%rsi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(first_aligned_loop_return):
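	/* Build 64-bit masks: nulls from ymm4 fill the low 32 bits and
	   nulls from either vector (via the min in ymm8) the high 32 bits.
	   blsmsk then covers every bit up to and including the first null,
	   and the `and` keeps only CHAR matches at or before it. */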
	VPCMPEQ	%ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %edx
	salq	$32, %rcx
	orq	%rdx, %rcx

	vpmovmskb %ymm10, %eax
	vpmovmskb %ymm6, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	blsmskq	%rcx, %rcx
	andq	%rcx, %rax
	jz	L(first_vec_x1_or_x2)

	bsrq	%rax, %rax
	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	/* Search char cannot be zero. */
	.p2align 4
L(second_aligned_loop_set_furthest_match):
	/* Save VEC and pointer from most recent match. */
L(second_aligned_loop_prep):
	movq	%rdi, %rsi
	vmovdqu	%ymm6, %ymm2
	vmovdqu	%ymm10, %ymm3

	.p2align 4
L(second_aligned_loop):
	/* Search 2x at a time. */
	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5

	VPCMPEQ	%ymm4, %ymm7, %ymm6
	VPMIN	%ymm4, %ymm5, %ymm1
	VPCMPEQ	%ymm5, %ymm7, %ymm10
	vpor	%ymm6, %ymm10, %ymm5
	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpor	%ymm5, %ymm1, %ymm9

	vpmovmskb %ymm9, %eax
	addq	$(VEC_SIZE * 2), %rdi
	testl	%eax, %eax
	jz	L(second_aligned_loop)
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	jz	L(second_aligned_loop_set_furthest_match)
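	/* A null was found. If this vector pair also contains a CHAR
	   match it may supersede the saved one; otherwise return the
	   saved match. */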
	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(return_new_match)

	/* This is the hot path. We know CHAR is inbounds and that
	   ymm2/ymm3 hold the latest match. */
	.p2align 4,, 4
L(return_old_match):
	vpmovmskb %ymm3, %eax
	vpmovmskb %ymm2, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	bsrq	%rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr. */
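	/* VEC_SIZE * -2 undoes the loop's pointer advance past the saved
	   pair; -(CHAR_SIZE - 1) steps bsrq's last-byte bit index back to
	   the first byte of the wide char. */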
	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
	VZEROUPPER_RETURN

	/* Last iteration also potentially has a match. */
	.p2align 4,, 8
L(return_new_match):
	VPCMPEQ	%ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %edx
	salq	$32, %rcx
	orq	%rdx, %rcx

	vpmovmskb %ymm10, %eax
	vpmovmskb %ymm6, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	blsmskq	%rcx, %rcx
	andq	%rcx, %rax
	jz	L(return_old_match)
	bsrq	%rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr. */
	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(cross_page):
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rsi
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi). */
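	/* (shrx only uses the shift count modulo 32, i.e. the
	   misalignment of rdi within the aligned VEC.) */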
	shrxl	%edi, %ecx, %ecx
	testl	%ecx, %ecx
	jz	L(page_cross_continue)
	VPCMPEQ	%ymm1, %ymm7, %ymm1
	vpmovmskb %ymm1, %eax

	/* Shift out search CHAR matches that are before the beginning of
	   src (rdi). */
	shrxl	%edi, %eax, %eax
	blsmskl	%ecx, %ecx
	/* Check if any search CHAR match in range. */
	andl	%ecx, %eax
	jz	L(ret2)
	bsrl	%eax, %eax
	addq	%rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret2):
	VZEROUPPER_RETURN
END(STRRCHR)
#endif