/* memrchr optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMRCHR
#  define MEMRCHR	__memrchr_avx2
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096
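
/* Overview (editor's note): memrchr (s, c, n) returns a pointer to the last
   occurrence of the byte C in the first N bytes of S, or NULL.  This
   implementation works backwards from the end of the buffer, checking
   VEC_SIZE (32) bytes per vpcmpeqb and 4 * VEC_SIZE bytes per iteration of
   the main loop.  A rough scalar C reference of the semantics being
   implemented (illustrative only, not part of the build):

     #include <stddef.h>

     static void *
     memrchr_ref (const void *s, int c, size_t n)
     {
       const unsigned char *p = (const unsigned char *) s + n;
       while (n--)
         if (*--p == (unsigned char) c)
           return (void *) p;
       return NULL;
     }
 */
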
	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
# else
	test	%RDX_LP, %RDX_LP
# endif
	jz	L(zero_0)

	vmovd	%esi, %xmm0
	/* Get end pointer.  Minus one for two reasons: 1) it is necessary for
	   a correct page cross check and 2) it sets up the end pointer so
	   that the lzcnt result can be subtracted from it directly.  */
	leaq	-1(%rdx, %rdi), %rax

	vpbroadcastb %xmm0, %ymm0

	/* Check if we can load 1x VEC without crossing a page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %eax
	jz	L(page_cross)
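
	/* Editor's note: the test above takes the page-cross path when the
	   32-byte load ending at the last byte might touch the previous
	   page.  A rough C sketch of the condition (illustrative only;
	   'last' here is s + n - 1, the address of the last valid byte):

	     #include <stdint.h>

	     static int
	     load_may_cross_page (uintptr_t last)
	     {
	       // True when (last % PAGE_SIZE) < VEC_SIZE, i.e. the 32 bytes
	       // ending at 'last' may begin on the previous page.
	       return (last & (4096 - 32)) == 0;
	     }
	 */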

	vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
	vpmovmskb %ymm1, %ecx
	cmpq	$VEC_SIZE, %rdx
	ja	L(more_1x_vec)

L(ret_vec_x0_test):
	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
	   which guarantees edx (len) is less than or equal to it.  */
	lzcntl	%ecx, %ecx

	/* Hoist vzeroupper (not great for RTM) to save code size.  This
	   allows all logic for edx (len) <= VEC_SIZE to fit in the first
	   cache line.  */
	COND_VZEROUPPER
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret
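
	/* Editor's note: in the mask from vpmovmskb, bit 31 corresponds to
	   the byte at rax (the current end), so lzcnt gives the distance
	   from the end back to the last match.  A rough C sketch of the
	   return path above (illustrative only; _lzcnt_u32 assumes LZCNT
	   support, which the instructions in this variant already require):

	     #include <immintrin.h>
	     #include <stdint.h>
	     #include <stddef.h>

	     static unsigned char *
	     last_match_in_vec (unsigned char *end, uint32_t mask, size_t len)
	     {
	       unsigned int dist = _lzcnt_u32 (mask);  // 32 when mask == 0
	       if (len <= dist)   // match (if any) lies before the buffer
	         return NULL;
	       return end - dist;
	     }
	 */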

	/* Fits in aligning bytes of first cache line.  */
L(zero_0):
	xorl	%eax, %eax
	ret

	.p2align 4,, 9
L(ret_vec_x0):
	lzcntl	%ecx, %ecx
	subq	%rcx, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4,, 10
L(more_1x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Align rax (string pointer).  */
	andq	$-VEC_SIZE, %rax

	/* Recompute remaining length after aligning.  */
	movq	%rax, %rdx
	/* Need this comparison next no matter what.  */
	vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
	subq	%rdi, %rdx
	decq	%rax
	vpmovmskb %ymm1, %ecx
	/* Fall through for the short case (hotter than the long one).  */
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
L(last_2x_vec):
	cmpl	$VEC_SIZE, %edx
	jbe	L(ret_vec_x0_test)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
	vpmovmskb %ymm1, %ecx
	/* 64-bit lzcnt.  This will naturally add 32 to the position.  */
	lzcntq	%rcx, %rcx
	COND_VZEROUPPER
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret
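
	/* Editor's note: rcx holds only a 32-bit mask (zero-extended), so the
	   64-bit lzcnt above returns 32 + lzcnt of the low 32 bits, folding
	   the extra VEC_SIZE of distance from the end into one instruction.
	   A minimal C sketch (illustrative only):

	     #include <immintrin.h>
	     #include <stdint.h>

	     static unsigned int
	     dist_from_end_second_vec (uint32_t mask)
	     {
	       // Equivalent to 32 + _lzcnt_u32 (mask) because the upper
	       // 32 bits of the operand are known to be zero.
	       return (unsigned int) _lzcnt_u64 ((uint64_t) mask);
	     }
	 */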


	/* Inexpensive place to put this regarding code size / target
	   alignments / ICache NLP.  Necessary for the 2-byte encoding of the
	   jump to the page cross case, which in turn is necessary for the
	   hot path (len <= VEC_SIZE) to fit in the first cache line.  */
L(page_cross):
	movq	%rax, %rsi
	andq	$-VEC_SIZE, %rsi
	vpcmpeqb (%rsi), %ymm0, %ymm1
	vpmovmskb %ymm1, %ecx
	/* Shift out the bytes past the end (the negative alignment), because
	   we are starting from the end pointer and working backwards.  */
	movl	%eax, %r8d
	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
	notl	%r8d
	shlxl	%r8d, %ecx, %ecx
	cmpq	%rdi, %rsi
	ja	L(more_1x_vec)
	lzcntl	%ecx, %ecx
	COND_VZEROUPPER
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret
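
	/* Editor's note: the shlx above discards mask bits for bytes beyond
	   the end of the buffer.  The aligned load covers 32 bytes starting
	   at rsi, but only offsets 0 .. (last % 32) are inside the buffer;
	   shifting the mask left by 31 - (last % 32) moves the bit for the
	   last valid byte into bit 31 and drops the invalid bits, so lzcnt
	   then works as in the non-crossing path.  A rough C sketch
	   (illustrative only):

	     #include <stdint.h>

	     static uint32_t
	     align_mask_to_end (uint32_t mask, uintptr_t last)
	     {
	       unsigned int shift = (~(unsigned int) last) & 31;  // 31 - (last % 32)
	       return mask << shift;
	     }
	 */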
	.p2align 4,, 11
L(ret_vec_x1):
	/* This will naturally add 32 to the position.  */
	lzcntq	%rcx, %rcx
	subq	%rcx, %rax
	VZEROUPPER_RETURN
	.p2align 4,, 10
L(more_2x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)


	/* Needed no matter what.  */
	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
	vpmovmskb %ymm1, %ecx

	subq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	cmpl	$(VEC_SIZE * -1), %edx
	jle	L(ret_vec_x2_test)

L(last_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	/* Needed no matter what.  */
	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
	vpmovmskb %ymm1, %ecx
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 3), %rax
	COND_VZEROUPPER
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_2)
	ret

	/* Fits in the aligning bytes.  */
L(zero_2):
	xorl	%eax, %eax
	ret

	.p2align 4,, 4
L(ret_vec_x2_test):
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2), %rax
	COND_VZEROUPPER
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_2)
	ret


	.p2align 4,, 11
L(ret_vec_x2):
	/* ecx must be non-zero.  */
	bsrl	%ecx, %ecx
	leaq	(VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
	VZEROUPPER_RETURN

	.p2align 4,, 14
L(ret_vec_x3):
	/* ecx must be non-zero.  */
	bsrl	%ecx, %ecx
	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
	VZEROUPPER_RETURN



	.p2align 4
L(more_4x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
	vpmovmskb %ymm1, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	/* Check if near the end before re-aligning (otherwise we might do an
	   unnecessary loop iteration).  */
	addq	$-(VEC_SIZE * 4), %rax
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)

	/* Align rax (set its low bits to VEC_SIZE * 4 - 1) so it points to
	   the last byte of a 4 * VEC_SIZE block.  */
	orq	$(VEC_SIZE * 4 - 1), %rax
	movq	%rdi, %rdx
	/* Get the end pointer for the loop in rdx.  NB: we can't just loop
	   while rax > rdi because lengths that overflow can be valid and
	   would break the ordered comparison.  */
	orq	$(VEC_SIZE * 4 - 1), %rdx
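
	/* Editor's note: both the running pointer (rax) and the terminator
	   (rdx) get their low bits set to VEC_SIZE * 4 - 1, so they stay
	   congruent modulo the 128-byte step and the loop can stop on exact
	   equality rather than on an ordered compare that a very large
	   length could make wrap.  A minimal C sketch (illustrative only):

	     #include <stdint.h>

	     static uintptr_t
	     round_to_block_last_byte (uintptr_t p)
	     {
	       // Last byte of the 4 * VEC_SIZE block containing p.
	       return p | (4 * 32 - 1);
	     }
	 */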

	.p2align 4
L(loop_4x_vec):
	/* Need this comparison next no matter what.  */
	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4

	vpor	%ymm1, %ymm2, %ymm2
	vpor	%ymm3, %ymm4, %ymm4
	vpor	%ymm2, %ymm4, %ymm4
	vpmovmskb %ymm4, %esi

	testl	%esi, %esi
	jnz	L(loop_end)

	addq	$(VEC_SIZE * -4), %rax
	cmpq	%rdx, %rax
	jne	L(loop_4x_vec)
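
	/* Editor's note: a rough intrinsics equivalent of one iteration of
	   the loop above (illustrative only; the real code uses aligned
	   loads and keeps the four compare results around for L(loop_end)):

	     #include <immintrin.h>

	     // 'p' is the last byte of the 128-byte block, 'vc' the
	     // broadcast search byte.
	     static int
	     block_has_match (const unsigned char *p, __m256i vc)
	     {
	       __m256i v1 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) (p - 31)), vc);
	       __m256i v2 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) (p - 63)), vc);
	       __m256i v3 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) (p - 95)), vc);
	       __m256i v4 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) (p - 127)), vc);
	       __m256i m = _mm256_or_si256 (_mm256_or_si256 (v1, v2),
	                                    _mm256_or_si256 (v3, v4));
	       return _mm256_movemask_epi8 (m) != 0;
	     }
	 */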

	/* Remaining length for the tail: bytes from rdi up to the loop end
	   pointer (inclusive).  */
	subl	%edi, %edx
	incl	%edx

L(last_4x_vec):
	/* Used no matter what.  */
	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
	vpmovmskb %ymm1, %ecx

	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_end)

	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1_end)

	/* Used no matter what.  */
	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
	vpmovmskb %ymm1, %ecx

	cmpl	$(VEC_SIZE * 3), %edx
	ja	L(last_vec)

	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2), %rax
	COND_VZEROUPPER
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	jbe	L(ret0)
	xorl	%eax, %eax
L(ret0):
	ret


	.p2align 4
L(loop_end):
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_end)

	vpmovmskb %ymm2, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1_end)

	vpmovmskb %ymm3, %ecx
	/* Combine the last 2 VEC matches.  If ecx (VEC3) is zero (no CHAR in
	   VEC3) it won't affect the result in esi (VEC4).  If ecx is
	   non-zero then there is a CHAR in VEC3 and bsrq will use that
	   position.  */
	salq	$32, %rcx
	orq	%rsi, %rcx
	bsrq	%rcx, %rcx
	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
	VZEROUPPER_RETURN
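
	/* Editor's note: a rough C sketch of the VEC3/VEC4 combination above
	   (illustrative only; __builtin_clzll is the GCC/Clang builtin).
	   VEC3 covers bytes nearer the end than VEC4, so placing its mask in
	   the high 32 bits makes a single bit scan reverse prefer it, and
	   the bit index maps straight back to an offset from the block end:

	     #include <stdint.h>

	     // 'block_end' is the last byte of the 128-byte block; the
	     // combined mask is known to be non-zero on this path.
	     static unsigned char *
	     last_match_low_vecs (unsigned char *block_end, uint32_t mask3,
	                          uint32_t mask4)
	     {
	       uint64_t combined = ((uint64_t) mask3 << 32) | mask4;
	       unsigned int idx = 63 - (unsigned int) __builtin_clzll (combined);
	       return block_end - 127 + idx;   // == block_end - 4 * 32 + 1 + idx
	     }
	 */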

	.p2align 4,, 4
L(ret_vec_x1_end):
	/* The 64-bit version will automatically add 32 (VEC_SIZE).  */
	lzcntq	%rcx, %rcx
	subq	%rcx, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(ret_vec_x0_end):
	lzcntl	%ecx, %ecx
	subq	%rcx, %rax
	VZEROUPPER_RETURN

	/* 2 bytes until next cache line.  */
END(MEMRCHR)
#endif
