1/* rawmemchr optimized with 256-bit EVEX instructions.
2 Copyright (C) 2022-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <isa-level.h>
20#include <sysdep.h>
21
22#if ISA_SHOULD_BUILD (4)
23
/* Defaults that only take effect when the names are not already
   defined, so an including file (or the command line) can override
   them.  When VEC_SIZE is not set the vector configuration comes from
   x86-evex256-vecs.h (presumably the header that provides VEC_SIZE,
   VMM, VMM_lo and SECTION -- confirm there), and the routine is
   emitted under the symbol name __rawmemchr_evex.  */
24# ifndef VEC_SIZE
25# include "x86-evex256-vecs.h"
26# endif
27
28# ifndef RAWMEMCHR
29# define RAWMEMCHR __rawmemchr_evex
30# endif
31
32
/* Element-type selection: every element-wise instruction macro is
   bound to its byte ("b") form and CHAR_SIZE is 1, so each mask bit
   produced below corresponds to exactly one byte of the vector.
   PC_SHIFT_GPR and REG_WIDTH are not referenced in this file itself;
   they are presumably inputs to reg-macros.h, included right after
   this group, which would make the VRAX/VRCX/VRDI register views
   VEC_SIZE bits wide -- confirm in that header.  */
33# define PC_SHIFT_GPR rdi
34# define REG_WIDTH VEC_SIZE
35# define VPTESTN vptestnmb
36# define VPBROADCAST vpbroadcastb
37# define VPMINU vpminub
38# define VPCMP vpcmpb
39# define VPCMPEQ vpcmpeqb
40# define CHAR_SIZE 1
41
42# include "reg-macros.h"
43
44/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64
45 doesn't have VEX encoding), use VEX encoding in loop so we
46 can use vpcmpeqb + vptern which is more efficient than the
47 EVEX alternative. */
/* USE_TERN_IN_LOOP selects the main-loop variant further down:
   0 = EVEX-only loop, 1 = VEX vpcmpeqb + vpternlogd loop.  */
48# if defined USE_IN_RTM || VEC_SIZE == 64
/* EVEX-only loop: drop any earlier definitions of the vzeroupper
   hooks and redefine them so that COND_VZEROUPPER and VZEROUPPER
   expand to nothing and VZEROUPPER_RETURN is a bare ret.  */
49# undef COND_VZEROUPPER
50# undef VZEROUPPER_RETURN
51# undef VZEROUPPER
52
53
54# define COND_VZEROUPPER
55# define VZEROUPPER_RETURN ret
56# define VZEROUPPER
57
58# define USE_TERN_IN_LOOP 0
59# else
/* VEX loop: VZEROUPPER is forced to a real vzeroupper.
   COND_VZEROUPPER, which is what the routine actually invokes, is not
   redefined here; presumably its definition from an included header
   expands in terms of VZEROUPPER -- verify.  */
60# define USE_TERN_IN_LOOP 1
61# undef VZEROUPPER
62# define VZEROUPPER vzeroupper
63# endif
64
/* Elements are single bytes, so a vector holds VEC_SIZE of them.  */
65# define CHAR_PER_VEC VEC_SIZE

/* The routine has two shared return stubs after the main loop: the
   "fallthrough" one, which the loop-exit code runs straight into, and
   a stand-alone "tail" one.  These macros decide which of the labels
   first_vec_x2 / first_vec_x3 (and the matching byte offset from rdi)
   each stub gets.  With 64 chars per vector the fallthrough stub
   serves the 4th vector (x3) and the tail stub the 3rd (x2); with
   smaller vectors the assignment is swapped.  */
66
67# if CHAR_PER_VEC == 64
68
69# define TAIL_RETURN_LBL first_vec_x2
70# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2)
71
72# define FALLTHROUGH_RETURN_LBL first_vec_x3
73# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
74
75# else /* !(CHAR_PER_VEC == 64) */
76
77# define TAIL_RETURN_LBL first_vec_x3
78# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3)
79
80# define FALLTHROUGH_RETURN_LBL first_vec_x2
81# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
82# endif /* !(CHAR_PER_VEC == 64) */
83
84
/* VMATCH is the vector register that receives the broadcast search
   byte at function entry; VMATCH_LO is the VMM_lo(0) register that the
   VEX loop variant copies VMATCH into before use.  */
85# define VMATCH VMM(0)
86# define VMATCH_LO VMM_lo(0)
87
/* Page size used by the entry check that decides whether the first
   unaligned vector load could run past the end of the page.  */
88# define PAGE_SIZE 4096
89
/* RAWMEMCHR -- return the address of the first byte, scanning forward
   from rdi, that equals the low byte of esi.

   In:     rdi = start address
	   esi = byte to search for (replicated by VPBROADCAST)
   Out:    rax = address of the first matching byte
   Writes: rax, rdi, flags, k0, k1, VMM(0);
	   rcx when CHAR_PER_VEC != 64;
	   k2, k3, VMM(2), VMM(3) in the EVEX loop;
	   VMM_lo(0), VMM_lo(2), VMM_lo(3), VMM_lo(4) in the VEX loop.

   There is no length argument and no terminator test: every return
   below is reached only through a non-zero match mask.

   Structure: one VEC_SIZE compare at the original pointer (with a
   page-cross safe variant), four single-vector compares from the
   VEC_SIZE-aligned pointer, then a loop that tests 4 * VEC_SIZE bytes
   per iteration from a 4 * VEC_SIZE-aligned pointer.  Throughout,
   bit i of a match mask stands for byte i of the vector it came
   from, so bsf on the mask yields the byte offset of the first
   match.  */
90 .section SECTION(.text), "ax", @progbits
91ENTRY_P2ALIGN (RAWMEMCHR, 6)
92 VPBROADCAST %esi, %VMATCH
93 /* Check if we may cross page boundary with one vector load. */
94 movl %edi, %eax
95 andl $(PAGE_SIZE - 1), %eax
96 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
 /* eax = offset of rdi inside its page.  Above PAGE_SIZE - VEC_SIZE
    a VEC_SIZE-byte load at rdi would extend into the next page.  */
97 ja L(page_cross)
98
 /* Compare the first VEC_SIZE bytes at the (possibly unaligned)
    start pointer.  */
99 VPCMPEQ (%rdi), %VMATCH, %k0
100 KMOV %k0, %VRAX
101
102 test %VRAX, %VRAX
103 jz L(aligned_more)
 /* Shared return: rax holds a mask whose bit 0 corresponds to the
    byte at rdi.  Also the target of the page-cross path.  */
104L(first_vec_x0):
105 bsf %VRAX, %VRAX
106 addq %rdi, %rax
107 ret
108
109 .p2align 4,, 4
 /* Return for a match in the vector at (VEC_SIZE * 4)(%rdi), taken
    from the pre-loop checks.  */
110L(first_vec_x4):
111 bsf %VRAX, %VRAX
112 leaq (VEC_SIZE * 4)(%rdi, %rax), %rax
113 ret
114
115 /* For VEC_SIZE == 32 we can fit this in aligning bytes so might
116 as well place it more locally. For VEC_SIZE == 64 we reuse
117 return code at the end of loop's return. */
118# if VEC_SIZE == 32
119 .p2align 4,, 4
120L(FALLTHROUGH_RETURN_LBL):
121 bsf %VRAX, %VRAX
122 leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
123 ret
124# endif
125
126 .p2align 4,, 6
127L(page_cross):
128 /* eax has lower page-offset bits of rdi so xor will zero them
129 out. */
130 xorq %rdi, %rax
 /* rax is now the page base, so this loads the last VEC_SIZE bytes
    of the page rdi lies in and never touches the following page.  */
131 VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
132 KMOV %k0, %VRAX
133
134 /* Shift out out-of-bounds matches. */
 /* shrx only uses the low bits of its count register; assuming the
    VRDI/VRAX views are VEC_SIZE bits wide (see reg-macros.h) that is
    rdi's offset inside the loaded vector, so afterwards bit 0 of the
    mask corresponds to the byte at rdi.  */
135 shrx %VRDI, %VRAX, %VRAX
136 test %VRAX, %VRAX
137 jnz L(first_vec_x0)
 /* No match up to the end of the page: fall through and continue
    from the next aligned vector.  */
138
139 .p2align 4,, 10
140L(aligned_more):
141L(page_cross_continue):
142 /* Align pointer. */
 /* Rounds rdi down to a multiple of VEC_SIZE.  The bytes between
    the original pointer and the next VEC_SIZE boundary were covered
    by the compare that led here, so checking resumes at
    VEC_SIZE(%rdi).  */
143 andq $(VEC_SIZE * -1), %rdi
144
145 VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
146 KMOV %k0, %VRAX
147 test %VRAX, %VRAX
148 jnz L(first_vec_x1)
149
150 VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
151 KMOV %k0, %VRAX
152 test %VRAX, %VRAX
153 jnz L(first_vec_x2)
154
155 VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
156 KMOV %k0, %VRAX
157 test %VRAX, %VRAX
158 jnz L(first_vec_x3)
159
160 VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
161 KMOV %k0, %VRAX
162 test %VRAX, %VRAX
163 jnz L(first_vec_x4)
164
 /* Prepare the loop pointer: advance rdi by VEC_SIZE, then round it
    down to a multiple of 4 * VEC_SIZE.  The loop reads from
    (VEC_SIZE * 4)(%rdi) onwards, which after this rounding is at
    most the first byte not yet checked above, so nothing is skipped
    (some already-checked bytes may be compared again).  */
165 subq $-(VEC_SIZE * 1), %rdi
166# if VEC_SIZE == 64
167 /* Saves code size. No evex512 processor has partial register
168 stalls. If that changes this can be replaced with `andq
169 $-(VEC_SIZE * 4), %rdi`. */
 /* 4 * VEC_SIZE is 256 here, so clearing the low byte of rdi is the
    same rounding as the andq in the other branch.  */
170 xorb %dil, %dil
171# else
172 andq $-(VEC_SIZE * 4), %rdi
173# endif
174
175# if USE_TERN_IN_LOOP
176 /* copy VMATCH to low ymm so we can use vpcmpeq which is not
177 encodable with EVEX registers. NB: this is VEC_SIZE == 32
178 only as there is no way to encode vpcmpeq with zmm0-15. */
179 vmovdqa64 %VMATCH, %VMATCH_LO
180# endif
181
182 .p2align 4
183L(loop_4x_vec):
184 /* Two versions of the loop. One that does not require
185 vzeroupper by not using ymm0-15 and another that does
186 require vzeroupper because it uses ymm0-15. The reason why
187 ymm0-15 is used at all is because there is no EVEX encoding
188 vpcmpeq and with vpcmpeq this loop can be performed more
189 efficiently. The non-vzeroupper version is safe for RTM
190 while the vzeroupper version should be preferred if RTM is
191 not supported. Which loop version we use is determined by
192 USE_TERN_IN_LOOP. */
 /* In both versions the four vectors tested in one iteration are
    called VEC 0-3 below; on loop exit rdi points at VEC 0.  */
193
194# if USE_TERN_IN_LOOP
195 /* Since vptern can only take 3x vectors fastest to do 1 vec
196 separately with EVEX vpcmp. */
197 VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
198 /* Compare 3x with vpcmpeq and or them all together with vptern.
199 */
200
 /* rdi is advanced between the second and third compare, so the
    offsets 5 (before) and 2, 3 (after the subq) address three
    consecutive vectors: VEC 1, VEC 2 and VEC 3.  */
201 VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2)
202 subq $(VEC_SIZE * -4), %rdi
203 VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
204 VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
205
206 /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
207 VEC_lo(4). */
208 vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
209 vpmovmskb %VMM_lo(4), %VRCX
210
211 KMOV %k1, %eax
212
213 /* NB: rax has match from first VEC and rcx has matches from
214 VEC 2-4. If rax is non-zero we will return that match. If
215 rax is zero adding won't disturb the bits in rcx. */
 /* The add also sets ZF for the loop branch: the sum is zero only
    when neither mask has a bit set.  */
216 add %rax, %rcx
217# else
218 /* Loop version that uses EVEX encoding. */
 /* k1 = bytes of VEC 0 that do NOT match (VPCMP predicate 4 is
    not-equal).  VMM(2) and VMM(3) are VEC 1 and VEC 2 xor-ed with
    VMATCH, so a matching byte becomes zero.  VPMINU, zero-masked
    with k1, leaves a zero byte wherever VEC 0, VEC 1 or VEC 2
    matched; VPTESTN turns those zero bytes into set bits of k2.
    k3 = matches in VEC 3.  KORTEST sets ZF only when k2 | k3 is
    zero, i.e. none of the four vectors matched.  */
219 VPCMP $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
220 vpxorq (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2)
221 vpxorq (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3)
222 VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3
223 VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z}
224 VPTESTN %VMM(3), %VMM(3), %k2
225 subq $(VEC_SIZE * -4), %rdi
226 KORTEST %k2, %k3
227# endif
228 jz L(loop_4x_vec)
229
 /* A match exists in VEC 0-3; find the first vector that has it.
    VEC 0 first.  */
230# if USE_TERN_IN_LOOP
231 test %VRAX, %VRAX
232# else
 /* k1 is a not-equal mask: inc wraps an all-ones mask (no match) to
    zero; otherwise the lowest set bit of the result is the lowest
    clear bit of k1, which is the first match.  This presumably
    relies on VRAX being exactly as wide as the mask -- see
    REG_WIDTH / reg-macros.h.  */
233 KMOV %k1, %VRAX
234 inc %VRAX
235# endif
236 jnz L(last_vec_x0)
237
238
 /* VEC 1.  */
239# if USE_TERN_IN_LOOP
240 vpmovmskb %VMM_lo(2), %VRAX
241# else
242 VPTESTN %VMM(2), %VMM(2), %k1
243 KMOV %k1, %VRAX
244# endif
245 test %VRAX, %VRAX
246 jnz L(last_vec_x1)
247
248
 /* rax = matches in VEC 2.  In the EVEX version k2 combines VEC 0-2,
    but VEC 0 and VEC 1 have just been ruled out.  */
249# if USE_TERN_IN_LOOP
250 vpmovmskb %VMM_lo(3), %VRAX
251# else
252 KMOV %k2, %VRAX
253# endif
254
255 /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
256 (only if used VEX encoded loop). */
257 COND_VZEROUPPER
258
259 /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
260 returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
261 individually, for VEC_SIZE == 32 we combine them in a single
262 64-bit GPR. */
263# if CHAR_PER_VEC == 64
264# if USE_TERN_IN_LOOP
265# error "Unsupported"
266# endif
267
268
269 /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */
270 test %VRAX, %VRAX
271 jnz L(first_vec_x2)
 /* Not VEC 2, so the match is in VEC 3 (mask in k3).  */
272 KMOV %k3, %VRAX
273L(FALLTHROUGH_RETURN_LBL):
274# else
275 /* CHAR_PER_VEC <= 32 so we can combine the results from the
276 last 2x VEC. */
277# if !USE_TERN_IN_LOOP
278 KMOV %k3, %VRCX
279# endif
 /* rcx holds the VEC 3 matches (in the VEX version possibly or-ed
    with VEC 2's, which is harmless).  Shifted above the VEC 2 bits
    in rax, a single bsf finds a VEC 2 match first and otherwise a
    VEC 3 match at index CHAR_PER_VEC + i, which the VEC 2 base
    offset below turns into the right address.  */
280 salq $CHAR_PER_VEC, %rcx
281 addq %rcx, %rax
282# endif
283 bsf %rax, %rax
284 leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
285 ret
286
287 .p2align 4,, 8
 /* Stand-alone return stub for whichever of first_vec_x2 /
    first_vec_x3 is not the fallthrough label.  */
288L(TAIL_RETURN_LBL):
289 bsf %rax, %rax
290 leaq (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
291 ret
292
293 .p2align 4,, 8
 /* last_vec_* are the loop exits and run COND_VZEROUPPER first;
    first_vec_x1 is entered from the pre-loop checks and skips it.  */
294L(last_vec_x1):
295 COND_VZEROUPPER
296L(first_vec_x1):
297 bsf %VRAX, %VRAX
298 leaq (VEC_SIZE * 1)(%rdi, %rax), %rax
299 ret
300
301 .p2align 4,, 8
302L(last_vec_x0):
303 COND_VZEROUPPER
304 bsf %VRAX, %VRAX
305 addq %rdi, %rax
306 ret
307END (RAWMEMCHR)
308#endif
309

source code of glibc/sysdeps/x86_64/multiarch/rawmemchr-evex.S