/* Implementation for strrchr using evex256 and evex512.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifdef USE_AS_WCSRCHR
#  if VEC_SIZE == 64
#   define RCX_M cx
#   define KORTEST_M kortestw
#  else
#   define RCX_M cl
#   define KORTEST_M kortestb
#  endif

#  define SHIFT_REG VRCX
#  define CHAR_SIZE 4
#  define VPCMP vpcmpd
#  define VPMIN vpminud
#  define VPTESTN vptestnmd
#  define VPTEST vptestmd
#  define VPBROADCAST vpbroadcastd
#  define VPCMPEQ vpcmpeqd

# else
#  if VEC_SIZE == 64
#   define SHIFT_REG VRCX
#  else
#   define SHIFT_REG VRDI
#  endif
#  define CHAR_SIZE 1
#  define VPCMP vpcmpb
#  define VPMIN vpminub
#  define VPTESTN vptestnmb
#  define VPTEST vptestmb
#  define VPBROADCAST vpbroadcastb
#  define VPCMPEQ vpcmpeqb

#  define RCX_M VRCX
#  define KORTEST_M KORTEST
# endif

# if VEC_SIZE == 32 || (defined USE_AS_WCSRCHR)
#  define SHIFT_R(cnt, val) shrx cnt, val, val
# else
#  define SHIFT_R(cnt, val) shr %cl, val
# endif

# define VMATCH VMM(0)
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE 4096

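/* A rough scalar reference of the behavior implemented below (exposition
   only, not assembled; the function name is illustrative).  This assumes
   the narrow-character build; the USE_AS_WCSRCHR build is analogous with
   4-byte characters:

	char *ref_strrchr (const char *s, int c)
	{
	  const char *last = NULL;
	  for (;; ++s)
	    {
	      if (*s == (char) c)
		last = s;
	      if (*s == '\0')
		return (char *) last;
	    }
	}

   The code below checks VEC_SIZE bytes at a time: the first vector is
   handled with a page-cross check, an aligned loop then scans two
   vectors per iteration while remembering the furthest pair of vectors
   known to contain a match, and the backward scan for the last match is
   only done once the zero CHAR has been found.  */
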
	.section SECTION(.text), "ax", @progbits
	/* Aligning the entry point to 64 bytes provides better performance
	   for strings of up to one vector length.  */
ENTRY_P2ALIGN(STRRCHR, 6)
	movl %edi, %eax
	/* Broadcast CHAR to VMATCH.  */
	VPBROADCAST %esi, %VMATCH

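	/* If the page offset of rdi is greater than PAGE_SIZE - VEC_SIZE,
	   an unaligned VEC_SIZE load could cross into the next (possibly
	   unmapped) page, so take the page-cross path.  */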
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jg L(cross_page_boundary)
L(page_cross_continue):
	VMOVU (%rdi), %VMM(1)
	/* k0 has a 1 for each zero CHAR in VEC(1).  */
	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VGPR(rsi)
	test %VGPR(rsi), %VGPR(rsi)
	jz L(aligned_more)
	/* fallthrough: zero CHAR in first VEC.  */

	/* K1 has a 1 for each search CHAR match in VEC(1).  */
	VPCMPEQ %VMATCH, %VMM(1), %k1
	KMOV %k1, %VGPR(rax)
	/* Build a mask up to and including the first zero CHAR (used to
	   mask off potential search CHAR matches past the end of the
	   string).  */
	blsmsk %VGPR(rsi), %VGPR(rsi)
	/* Use `and` here to remove any out of bounds matches so we can
	   do a reverse scan on `rax` to find the last match.  */
	and %VGPR(rsi), %VGPR(rax)
	jz L(ret0)
	/* Get last match.  */
	bsr %VGPR(rax), %VGPR(rax)
# ifdef USE_AS_WCSRCHR
	leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
	addq %rdi, %rax
# endif
L(ret0):
	ret

	/* The return paths for the first vectors x1/x2/x3 have a
	   hard-coded backward search for earlier matches.  */
	.p2align 4,, 6
L(first_vec_x1):
	VPCMPEQ %VMATCH, %VMM(2), %k1
	KMOV %k1, %VGPR(rax)
	blsmsk %VGPR(rcx), %VGPR(rcx)
	/* eax is non-zero if a search CHAR match is in range.  */
	and %VGPR(rcx), %VGPR(rax)
	jnz L(first_vec_x1_return)

	/* fallthrough: no match in VEC(2), so check for earlier matches
	   (in VEC(1)).  */
	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMPEQ %VMATCH, %VMM(1), %k1
	KMOV %k1, %VGPR(rax)
	test %VGPR(rax), %VGPR(rax)
	jz L(ret1)
	bsr %VGPR(rax), %VGPR(rax)
# ifdef USE_AS_WCSRCHR
	leaq (%rsi, %rax, CHAR_SIZE), %rax
# else
	addq %rsi, %rax
# endif
L(ret1):
	ret

	.p2align 4,, 10
L(first_vec_x3):
	VPCMPEQ %VMATCH, %VMM(4), %k1
	KMOV %k1, %VGPR(rax)
	blsmsk %VGPR(rcx), %VGPR(rcx)
	/* If there is no search CHAR match in range, check
	   VEC(1)/VEC(2)/VEC(3).  */
	and %VGPR(rcx), %VGPR(rax)
	jz L(first_vec_x1_or_x2)
	bsr %VGPR(rax), %VGPR(rax)
	leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret
	.p2align 4,, 4

L(first_vec_x2):
	VPCMPEQ %VMATCH, %VMM(3), %k1
	KMOV %k1, %VGPR(rax)
	blsmsk %VGPR(rcx), %VGPR(rcx)
	/* Check VEC(3) for the last match first.  If there is no match,
	   try VEC(2)/VEC(1).  */
	and %VGPR(rcx), %VGPR(rax)
	jz L(first_vec_x0_x1_test)
	bsr %VGPR(rax), %VGPR(rax)
	leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x0_x1_test):
	VPCMPEQ %VMATCH, %VMM(2), %k1
	KMOV %k1, %VGPR(rax)
	/* Check VEC(2) for the last match first.  If there is no match,
	   try VEC(1).  */
	test %VGPR(rax), %VGPR(rax)
	jz L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsr %VGPR(rax), %VGPR(rax)
	leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 12
L(aligned_more):
	/* Need to keep the original pointer in case VEC(1) has the last
	   match.  */
	movq %rdi, %rsi
	andq $-VEC_SIZE, %rdi

	VMOVU VEC_SIZE(%rdi), %VMM(2)
	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
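	/* r8 preserves the VEC_SIZE-aligned pointer for the return paths
	   that may run after rdi has been advanced by the loop.  */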
	movq %rdi, %r8
	test %VRCX, %VRCX
	jnz L(first_vec_x1)

	VMOVU (VEC_SIZE * 2)(%rdi), %VMM(3)
	VPTESTN %VMM(3), %VMM(3), %k0
	KMOV %k0, %VRCX

	test %VRCX, %VRCX
	jnz L(first_vec_x2)

	VMOVU (VEC_SIZE * 3)(%rdi), %VMM(4)
	VPTESTN %VMM(4), %VMM(4), %k0
	KMOV %k0, %VRCX

	/* Intentionally use the 64-bit register here: the EVEX256 version
	   needs 1 byte of padding for an efficient nop before the loop
	   alignment.  */
	test %rcx, %rcx
	jnz L(first_vec_x3)

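	/* Align rdi down to 2 * VEC_SIZE so the main loop can use aligned
	   loads while processing two vectors per iteration.  */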
	andq $-(VEC_SIZE * 2), %rdi
	.p2align 4
L(first_aligned_loop):
	/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
	   guarantee they don't contain a match.  */
	VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5)
	VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6)

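	/* VPCMP with predicate 4 tests for "not equal", so k2 has a 1 for
	   every CHAR in VEC(5) that is not the search CHAR.  The k2-masked
	   compare leaves k3 set only where neither VEC(5) nor VEC(6)
	   matches.  */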
	VPCMP $4, %VMM(5), %VMATCH, %k2
	VPCMP $4, %VMM(6), %VMATCH, %k3{%k2}

	VPMIN %VMM(5), %VMM(6), %VMM(7)

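	/* k1 gets a 1 only where there is neither a match nor a zero CHAR
	   in either vector.  KORTEST sets CF iff the mask is all ones, so
	   the loop continues while nothing of interest has been found.  */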
	VPTEST %VMM(7), %VMM(7), %k1{%k3}
	subq $(VEC_SIZE * -2), %rdi
	KORTEST_M %k1, %k1
	jc L(first_aligned_loop)

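	/* The loop exited because a match and/or a zero CHAR was found.
	   VRDX gets the zero CHAR positions in min(VEC(5), VEC(6)); if
	   there are none, only a match was found, so switch to the loop
	   that just records the furthest match.  */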
	VPTESTN %VMM(7), %VMM(7), %k1
	KMOV %k1, %VRDX
	test %VRDX, %VRDX
	jz L(second_aligned_loop_prep)

	KORTEST_M %k3, %k3
	jnc L(return_first_aligned_loop)

	.p2align 4,, 6
L(first_vec_x1_or_x2_or_x3):
	VPCMPEQ %VMM(4), %VMATCH, %k4
	KMOV %k4, %VRAX
	test %VRAX, %VRAX
	jz L(first_vec_x1_or_x2)
	bsr %VRAX, %VRAX
	leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 8
L(return_first_aligned_loop):
	VPTESTN %VMM(5), %VMM(5), %k0
	KMOV %k0, %VRCX
	blsmsk %VRCX, %VRCX
	jnc L(return_first_new_match_first)
	blsmsk %VRDX, %VRDX
	VPCMPEQ %VMM(6), %VMATCH, %k0
	KMOV %k0, %VRAX
	addq $VEC_SIZE, %rdi
	and %VRDX, %VRAX
	jnz L(return_first_new_match_ret)
	subq $VEC_SIZE, %rdi
L(return_first_new_match_first):
	KMOV %k2, %VRAX
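	/* VRAX holds the "not equal" mask from k2 and VRCX covers the
	   CHARs up to and including the first zero CHAR in VEC(5).  Flip
	   VRAX and restrict it to VRCX to get the matches that are in
	   bounds.  */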
# ifdef USE_AS_WCSRCHR
	xorl $((1 << CHAR_PER_VEC) - 1), %VRAX
	and %VRCX, %VRAX
# else
	andn %VRCX, %VRAX, %VRAX
# endif
	jz L(first_vec_x1_or_x2_or_x3)
L(return_first_new_match_ret):
	bsr %VRAX, %VRAX
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 10
L(first_vec_x1_or_x2):
	VPCMPEQ %VMM(3), %VMATCH, %k3
	KMOV %k3, %VRAX
	test %VRAX, %VRAX
	jz L(first_vec_x0_x1_test)
	bsr %VRAX, %VRAX
	leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
	/* We can throw away the work done for the first 4x checks here
	   as we have a later match.  This is effectively the 'fast'
	   path.  */
L(second_aligned_loop_prep):
L(second_aligned_loop_set_furthest_match):
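	/* Record the furthest pair of vectors known to contain a match:
	   rsi remembers the position and VEC(7)/VEC(8) keep copies of the
	   data so the match can be recovered if no later match is found
	   before the zero CHAR.  */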
	movq %rdi, %rsi
	VMOVA %VMM(5), %VMM(7)
	VMOVA %VMM(6), %VMM(8)
	.p2align 4
L(second_aligned_loop):
	VMOVU (VEC_SIZE * 4)(%rdi), %VMM(5)
	VMOVU (VEC_SIZE * 5)(%rdi), %VMM(6)
	VPCMP $4, %VMM(5), %VMATCH, %k2
	VPCMP $4, %VMM(6), %VMATCH, %k3{%k2}

	VPMIN %VMM(5), %VMM(6), %VMM(4)

	VPTEST %VMM(4), %VMM(4), %k1{%k3}
	subq $(VEC_SIZE * -2), %rdi
	KMOV %k1, %VRCX
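	/* RCX has a 0 bit for every position holding a match or a zero
	   CHAR.  Incrementing the CHAR_PER_VEC-bit register wraps it to
	   zero only if every bit was set, i.e. nothing was found, in which
	   case the loop continues.  */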
	inc %RCX_M
	jz L(second_aligned_loop)
	VPTESTN %VMM(4), %VMM(4), %k1
	KMOV %k1, %VRDX
	test %VRDX, %VRDX
	jz L(second_aligned_loop_set_furthest_match)

	KORTEST_M %k3, %k3
	jnc L(return_new_match)
	/* Branch here because there is a significant advantage, in terms
	   of the output dependency chain, to using edx.  */

L(return_old_match):
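	/* The latest pair of vectors contained a zero CHAR but no match in
	   range, so return the last match from the saved furthest-match
	   vectors, checking VEC(8) then VEC(7), relative to the saved base
	   in rsi.  */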
	VPCMPEQ %VMM(8), %VMATCH, %k0
	KMOV %k0, %VRCX
	bsr %VRCX, %VRCX
	jnz L(return_old_match_ret)

	VPCMPEQ %VMM(7), %VMATCH, %k0
	KMOV %k0, %VRCX
	bsr %VRCX, %VRCX
	subq $VEC_SIZE, %rsi
L(return_old_match_ret):
	leaq (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
	ret

L(return_new_match):
	VPTESTN %VMM(5), %VMM(5), %k0
	KMOV %k0, %VRCX
	blsmsk %VRCX, %VRCX
	jnc L(return_new_match_first)
	dec %VRDX
	VPCMPEQ %VMM(6), %VMATCH, %k0
	KMOV %k0, %VRAX
	addq $VEC_SIZE, %rdi
	and %VRDX, %VRAX
	jnz L(return_new_match_ret)
	subq $VEC_SIZE, %rdi
L(return_new_match_first):
	KMOV %k2, %VRAX
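	/* Same masking as above: turn the "not equal" mask in VRAX into a
	   match mask and restrict it to the in-bounds mask in VRCX.  */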
# ifdef USE_AS_WCSRCHR
	xorl $((1 << CHAR_PER_VEC) - 1), %VRAX
	and %VRCX, %VRAX
# else
	andn %VRCX, %VRAX, %VRAX
# endif
	jz L(return_old_match)
L(return_new_match_ret):
	bsr %VRAX, %VRAX
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

L(cross_page_boundary):
	/* eax contains all the page offset bits of src (rdi).  `xor rdi,
	   rax` yields a pointer with all page offset bits cleared, so an
	   offset of (PAGE_SIZE - VEC_SIZE) reads the last aligned VEC
	   before the page cross (guaranteed to be safe to read).  Doing
	   this as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax`
	   saves a bit of code size.  */
	xorq %rdi, %rax
	VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VRSI

	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
# if VEC_SIZE == 64 || (defined USE_AS_WCSRCHR)
	movl %edi, %ecx
# endif
# ifdef USE_AS_WCSRCHR
	andl $(VEC_SIZE - 1), %ecx
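	/* Convert the byte offset into a wchar_t index (CHAR_SIZE == 4).  */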
	shrl $2, %ecx
# endif
	SHIFT_R (%SHIFT_REG, %VRSI)
# if VEC_SIZE == 32 || (defined USE_AS_WCSRCHR)
	/* These configurations use shrx for SHIFT_R, which does not set
	   flags, so an explicit test is needed.  The strrchr-evex512 byte
	   version uses plain shr, which sets the zero flag itself.  */
	test %VRSI, %VRSI
# endif
	jz L(page_cross_continue)

	/* Found zero CHAR so need to test for search CHAR.  */
	VPCMPEQ %VMATCH, %VMM(1), %k1
	KMOV %k1, %VRAX
	/* Shift out search CHAR matches that are before the beginning of
	   src (rdi).  */
	SHIFT_R (%SHIFT_REG, %VRAX)
	/* Check if any search CHAR match in range.  */
	blsmsk %VRSI, %VRSI
	and %VRSI, %VRAX
	jz L(ret2)
	bsr %VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
	leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
	addq %rdi, %rax
# endif
L(ret2):
	ret
	/* 3 bytes from cache-line for evex.  */
	/* 0 bytes from cache-line for evex512.  */
END(STRRCHR)
#endif