/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# include "reg-macros.h"

# ifndef MEMRCHR
#  define MEMRCHR __memrchr_evex
# endif

# define PAGE_SIZE 4096
# define VMATCH VMM(0)

        .section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
        /* Clear upper bits.  */
        and %RDX_LP, %RDX_LP
# else
        test %RDX_LP, %RDX_LP
# endif
        jz L(zero_0)

        /* Get end pointer.  Minus one for three reasons.  1) It is
           necessary for a correct page cross check, 2) it sets up the
           end pointer so that subtracting lzcnt of the match mask
           yields the match address, and 3) it is a necessary step in
           aligning the pointer.  */
        leaq -1(%rdi, %rdx), %rax
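        /* Concretely, rax now points at the last byte of the buffer
           (&buf[len - 1]), so a match found lzcnt bytes down from the
           top of the final vector can be returned as rax - lzcnt.  */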
        vpbroadcastb %esi, %VMATCH

        /* Check if we can load 1x VEC without crossing a page.  */
        testl $(PAGE_SIZE - VEC_SIZE), %eax
        jz L(page_cross)

        /* Don't use rax for the pointer here because EVEX has better
           encoding with offset % VEC_SIZE == 0.  */
        vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0
        KMOV %k0, %VRCX

        /* If rcx is zero then lzcnt -> VEC_SIZE.  NB: there is
           already a dependency between rcx and rsi so no worries
           about a false dependency here.  */
        lzcnt %VRCX, %VRSI
        /* If rdx <= rsi then either 1) rcx was non-zero (there was a
           match) but it was out of bounds or 2) rcx was zero and rdx
           was <= VEC_SIZE so we are done scanning.  */
        cmpq %rsi, %rdx
        /* NB: Use a branch to return zero/non-zero.  Common usage
           will branch on the result of the function (null/non-null
           return).  This branch can be used to predict the ensuing
           one so there is no reason to extend the data dependency
           with cmovcc.  */
        jbe L(zero_0)
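        /* Worked through: the VEC load above covers bytes
           [rdi + rdx - VEC_SIZE, rdi + rdx).  If the highest match is
           mask bit j then rsi == VEC_SIZE - 1 - j and the match lives
           at rax - rsi; that address is >= rdi exactly when
           rsi < rdx, which is the check above.  */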

        /* If rcx is zero then len must be > VEC_SIZE; otherwise,
           since we already tested len vs lzcnt(rcx) (in rsi), we are
           good to return this match.  */
        test %VRCX, %VRCX
        jz L(more_1x_vec)
        subq %rsi, %rax
        ret

        /* Fits in aligning bytes of first cache line for VEC_SIZE ==
           32.  */
# if VEC_SIZE == 32
        .p2align 4,, 2
L(zero_0):
        xorl %eax, %eax
        ret
# endif

        .p2align 4,, 10
L(more_1x_vec):
        /* Align rax (pointer to string).  */
        andq $-VEC_SIZE, %rax
L(page_cross_continue):
        /* Recompute length after aligning.  */
        subq %rdi, %rax

        cmpq $(VEC_SIZE * 2), %rax
        ja L(more_2x_vec)

L(last_2x_vec):
        vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX

        test %VRCX, %VRCX
        jnz L(ret_vec_x0_test)

        /* If VEC_SIZE == 64 need to subtract because lzcntq won't
           implicitly add VEC_SIZE to match position.  */
# if VEC_SIZE == 64
        subl $VEC_SIZE, %eax
# else
        cmpb $VEC_SIZE, %al
# endif
        jle L(zero_2)

        /* We adjusted rax (length) for VEC_SIZE == 64 so need separate
           offsets.  */
# if VEC_SIZE == 64
        vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
# else
        vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
# endif
        KMOV %k0, %VRCX
        /* NB: 64-bit lzcnt.  This will naturally add 32 to position
           for VEC_SIZE == 32.  */
        lzcntq %rcx, %rcx
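        /* The upper bits of rcx above the mask are zero, so if the
           highest match is mask bit j then lzcntq yields 63 - j and
           eax - (63 - j) is one past the match offset; that pairs
           with the -1 in L(first_vec_x1_ret).  */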
        subl %ecx, %eax
        ja L(first_vec_x1_ret)
        /* If VEC_SIZE == 64 put L(zero_0) here as we can't fit it in
           the first cache line (this is the second cache line).  */
# if VEC_SIZE == 64
L(zero_0):
# endif
L(zero_2):
        xorl %eax, %eax
        ret

        /* NB: Fits in aligning bytes before the next cache line for
           VEC_SIZE == 32.  For VEC_SIZE == 64 this is attached to
           L(ret_vec_x0_test).  */
# if VEC_SIZE == 32
L(first_vec_x1_ret):
        leaq -1(%rdi, %rax), %rax
        ret
# endif

        .p2align 4,, 6
L(ret_vec_x0_test):
        lzcnt %VRCX, %VRCX
        subl %ecx, %eax
        jle L(zero_2)
# if VEC_SIZE == 64
        /* Reuse code at the end of L(ret_vec_x0_test) as we can't fit
           L(first_vec_x1_ret) in the same cache line as its jmp base
           so we might as well save code size.  */
L(first_vec_x1_ret):
# endif
        leaq -1(%rdi, %rax), %rax
        ret

        .p2align 4,, 6
L(loop_last_4x_vec):
        /* Compute remaining length.  */
        subl %edi, %eax
L(last_4x_vec):
        cmpl $(VEC_SIZE * 2), %eax
        jle L(last_2x_vec)
# if VEC_SIZE == 32
        /* Only align for VEC_SIZE == 32.  For VEC_SIZE == 64 we need
           the spare bytes to align the loop properly.  */
        .p2align 4,, 10
# endif
L(more_2x_vec):

        /* Length > VEC_SIZE * 2 so check the first 2x VEC for match
           and return if either hit.  */
        vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX

        test %VRCX, %VRCX
        jnz L(first_vec_x0)

        vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(first_vec_x1)

        /* Needed no matter what.  */
        vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX

        /* Check if we are near the end.  */
        subq $(VEC_SIZE * 4), %rax
        ja L(more_4x_vec)

        test %VRCX, %VRCX
        jnz L(first_vec_x2_test)

        /* Adjust length for the final check and check if we are at
           the end.  */
        addl $(VEC_SIZE * 1), %eax
        jle L(zero_1)

        vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX

        lzcnt %VRCX, %VRCX
        subl %ecx, %eax
        ja L(first_vec_x3_ret)
L(zero_1):
        xorl %eax, %eax
        ret
L(first_vec_x3_ret):
        leaq -1(%rdi, %rax), %rax
        ret

        .p2align 4,, 6
L(first_vec_x2_test):
        /* Must adjust length before check.  */
        subl $-(VEC_SIZE * 2 - 1), %eax
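        /* On entry eax == (aligned length) - VEC_SIZE * 4 and the
           match mask in rcx is for the VEC ending VEC_SIZE * 2 below
           the aligned end, so after adding VEC_SIZE * 2 - 1 the lzcnt
           subtraction below lands eax exactly on the match offset
           from rdi (negative means the match is before the start of
           the buffer).  */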
        lzcnt %VRCX, %VRCX
        subl %ecx, %eax
        jl L(zero_4)
        addq %rdi, %rax
        ret


        .p2align 4,, 10
L(first_vec_x0):
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * -1)(%rdi, %rax), %rax
        addq %rcx, %rax
        ret

        /* Fits unobtrusively here.  */
L(zero_4):
        xorl %eax, %eax
        ret

        .p2align 4,, 10
L(first_vec_x1):
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * -2)(%rdi, %rax), %rax
        addq %rcx, %rax
        ret

        .p2align 4,, 8
L(first_vec_x3):
        bsr %VRCX, %VRCX
        addq %rdi, %rax
        addq %rcx, %rax
        ret

        .p2align 4,, 6
L(first_vec_x2):
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * 1)(%rdi, %rax), %rax
        addq %rcx, %rax
        ret

        .p2align 4,, 2
L(more_4x_vec):
        test %VRCX, %VRCX
        jnz L(first_vec_x2)

        vpcmpeqb (%rdi, %rax), %VMATCH, %k0
        KMOV %k0, %VRCX

        test %VRCX, %VRCX
        jnz L(first_vec_x3)

        /* Check if near end before re-aligning (otherwise might do an
           unnecessary loop iteration).  */
        cmpq $(VEC_SIZE * 4), %rax
        jbe L(last_4x_vec)


        /* NB: We set up the loop to NOT use index-address-mode for
           the buffer.  This costs some instructions & code size but
           avoids stalls due to unlaminated micro-fused instructions
           (as used in the loop) being forced to issue in the same
           group (essentially narrowing the backend width).  */

        /* Get the end pointer for the loop in rdx.  NB: Can't just do
           while rax > rdi because lengths that overflow can be valid
           and break the comparison.  */
# if VEC_SIZE == 64
        /* Use rdx as an intermediate to compute rax; this gets us
           imm8 encoding which just allows the L(more_4x_vec) block to
           fit in 1 cache line.  */
        leaq (VEC_SIZE * 4)(%rdi), %rdx
        leaq (VEC_SIZE * -1)(%rdx, %rax), %rax

        /* No EVEX machine has partial register stalls.  This can be
           replaced with: `andq $(VEC_SIZE * -4), %rax/%rdx` if that
           changes.  */
        xorb %al, %al
        xorb %dl, %dl
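        /* NB: for VEC_SIZE == 64, VEC_SIZE * 4 == 256, so clearing
           the low byte rounds both pointers down to a 4x VEC
           boundary.  */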
# else
        leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
        andq $(VEC_SIZE * -4), %rax
        leaq (VEC_SIZE * 4)(%rdi), %rdx
        andq $(VEC_SIZE * -4), %rdx
# endif
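        /* Either way rax and rdx are now 4x VEC aligned: the loop
           below walks rax down in 4x VEC steps (its first iteration
           may re-check bytes already covered above) and stops once
           rax == rdx, leaving a tail of at most 4x VEC bytes for
           L(loop_last_4x_vec).  */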


        .p2align 4
L(loop_4x_vec):
        /* NB: We could do the same optimization here as we do for
           memchr/rawmemchr by using VEX encoding in the loop for
           access to VEX vpcmpeqb + vpternlogd.  Since memrchr is not
           as hot as memchr it may not be worth the extra code size,
           but if the need arises it is an easy ~15% perf improvement
           to the loop.  */

        cmpq %rdx, %rax
        je L(loop_last_4x_vec)
        /* Store 1 where not-equal and 0 where equal in k1 (used to
           mask later on).  */
        vpcmpb $4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1

        /* VEC(2/3) will have a zero byte where we found a CHAR.  */
        vpxorq (VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2)
        vpxorq (VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3)
        vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4

        /* Combine VEC(2/3) with min and maskz with k1 (k1 has a zero
           bit where CHAR is found and VEC(2/3) have a zero byte where
           CHAR is found).  */
        vpminub %VMM(2), %VMM(3), %VMM(3){%k1}{z}
        vptestnmb %VMM(3), %VMM(3), %k2
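        /* k2 now has a set bit wherever any of the first three VECs
           matched CHAR at that byte position: bytes masked off by k1
           (matches in the first VEC) are forced to zero, and the min
           is zero where VEC(2) or VEC(3) matched.  Together with k4
           this covers the whole 4x VEC block.  */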

        addq $-(VEC_SIZE * 4), %rax

        /* Any 1s and we found CHAR.  */
        KORTEST %k2, %k4
        jz L(loop_4x_vec)


        /* k1 has non-matches for the first VEC.  `inc` will overflow
           rcx iff all bytes were non-matches.  */
        KMOV %k1, %VRCX
        inc %VRCX
        jnz L(first_vec_x0_end)

        vptestnmb %VMM(2), %VMM(2), %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(first_vec_x1_end)
        KMOV %k2, %VRCX

        /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
           returning the last 2x VEC.  For VEC_SIZE == 64 we test each
           VEC individually, for VEC_SIZE == 32 we combine them in a
           single 64-bit GPR.  */
# if VEC_SIZE == 64
        test %VRCX, %VRCX
        jnz L(first_vec_x2_end)
        KMOV %k4, %VRCX
# else
        /* Combine the last 2 VEC matches for VEC_SIZE == 32.  If rcx
           (from VEC(3)) is zero (no CHAR in VEC(3)) then it won't
           affect the result in rsi (from VEC(4)).  If rcx is non-zero
           then CHAR is in VEC(3) and bsrq will use that position.  */
        KMOV %k4, %VRSI
        salq $32, %rcx
        orq %rsi, %rcx
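        /* After the shift/or, bit i of rcx corresponds to byte
           rax + i across the combined VEC(3)/VEC(4) region (rax was
           already moved to the bottom of the 4x VEC block), so bsrq
           directly yields the offset of the last match.  */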
# endif
        bsrq %rcx, %rcx
        addq %rcx, %rax
        ret

        .p2align 4,, 4
L(first_vec_x0_end):
        /* rcx has 1s at non-matches so we need to `not` it.  We used
           `inc` to test for zero so use `neg` to complete the `not`,
           so the last 1 bit represents a match.  NB: -(x + 1) == ~x.  */
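        /* 8-bit illustration: with a single match at byte 3 the
           non-match mask is x == 0xf7; after the `inc` above rcx
           holds x + 1 == 0xf8 (non-zero, so we branched here), and
           `neg` gives -(x + 1) == 0x08 == ~x, so bsr finds bit 3.  */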
        neg %VRCX
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
        ret

        .p2align 4,, 10
L(first_vec_x1_end):
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
        ret

# if VEC_SIZE == 64
        /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
           we need a return label for it.  */
        .p2align 4,, 4
L(first_vec_x2_end):
        bsr %VRCX, %VRCX
        leaq (VEC_SIZE * 1)(%rcx, %rax), %rax
        ret
# endif


        .p2align 4,, 4
L(page_cross):
        /* Only the low bits eax[log2(VEC_SIZE) - 1:0] of the page
           offset can be non-zero here (that is what sent us down the
           page-cross path), so movzbl is enough to get the offset of
           the last byte within the aligned VEC.  */
        movzbl %al, %ecx
        andq $-VEC_SIZE, %rax
        vpcmpeqb (%rax), %VMATCH, %k0
        KMOV %k0, %VRSI

        /* eax was computed as %rdi + %rdx - 1 so we need to add back
           1 here.  */
        leal 1(%rcx), %r8d

        /* Invert ecx to get the shift count that shifts matches past
           the end of the buffer out of the mask.  */
        notl %ecx
        shlx %VRCX, %VRSI, %VRSI
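        /* E.g. if the last in-bounds byte sits at offset ecx within
           the aligned VEC, the shift count used is VEC_SIZE - 1 - ecx
           (shift counts are taken mod the operand size), so matches
           past the end fall off the top and the last in-bounds byte's
           bit lands in the MSB; lzcnt below then counts bytes back
           from the end of the buffer.  */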

        /* If rdx <= r8 then the entire [buf, buf + len) range is
           handled in this page-cross case.  NB: we can't use the
           trick here that we use in the non-page-cross case because
           we aren't checking a full VEC_SIZE.  */
        cmpq %r8, %rdx
        ja L(page_cross_check)
        lzcnt %VRSI, %VRSI
        subl %esi, %edx
        ja L(page_cross_ret)
        xorl %eax, %eax
        ret

L(page_cross_check):
        test %VRSI, %VRSI
        jz L(page_cross_continue)

        lzcnt %VRSI, %VRSI
        subl %esi, %edx
L(page_cross_ret):
        leaq -1(%rdi, %rdx), %rax
        ret
END(MEMRCHR)
#endif


source code of glibc/sysdeps/x86_64/multiarch/memrchr-evex.S