strchr-evex.S source code [glibc/sysdeps/x86_64/multiarch/strchr-evex.S]

/* strchr/strchrnul optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19	#include <isa-level.h>
20
21	#if ISA_SHOULD_BUILD (4)
22
/* Build-time configuration.  Selects the element width, the matching
   vector mnemonics and the scratch registers used by the code below.  */
# include <sysdep.h>

/* Exported symbol name; only defined here if nothing defined it
   earlier.  */
# ifndef STRCHR
#  define STRCHR	__strchr_evex
# endif

/* Pull in the evex256 vector definitions unless VEC_SIZE has already
   been provided.  */
# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

/* Element-width dependent names:
   VPBROADCAST/VPCMP/VPCMPEQ/VPTESTN/VPTEST/VPMINU - dword forms for
   the wide build, byte forms otherwise.
   CHAR_REG  - view of esi that is compared against one loaded CHAR.
   SHIFT_REG - register holding the shift count in the page-cross path.
   CHAR_SIZE - bytes per CHAR (4 for the wide build, 1 otherwise).  */
# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMP	vpcmpd
#  define VPCMPEQ	vpcmpeqd
#  define VPTESTN	vptestnmd
#  define VPTEST	vptestmd
#  define VPMINU	vpminud
#  define CHAR_REG	esi
#  define SHIFT_REG	rcx
#  define CHAR_SIZE	4

#  define USE_WIDE_CHAR
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMP	vpcmpb
#  define VPCMPEQ	vpcmpeqb
#  define VPTESTN	vptestnmb
#  define VPTEST	vptestmb
#  define VPMINU	vpminub
#  define CHAR_REG	sil
#  define SHIFT_REG	rdi
#  define CHAR_SIZE	1
# endif

# include "reg-macros.h"

/* MASK_GPR  - GPR that receives the VEC_1 mask after the 4x loop.
   LOOP_REG  - pointer register advanced by the 4x loop.
   COND_MASK - expands to a {%k} predicate for VEC_SIZE == 64 and to
   nothing for VEC_SIZE == 32.  */
# if VEC_SIZE == 64
#  define MASK_GPR	rcx
#  define LOOP_REG	rax

#  define COND_MASK(k_reg)	{%k_reg}
# else
#  define MASK_GPR	rax
#  define LOOP_REG	rdi

#  define COND_MASK(k_reg)
# endif

/* Number of CHARs (and therefore mask bits) per vector.  */
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)


/* TESTZ(reg) increments REG at a width of exactly CHAR_PER_VEC bits,
   so the result is zero only if every mask bit was set.
   LAST_VEC_OFFSET is the displacement used by the final return path
   after the 4x loop: with 64 mask bits per vector the last two masks
   cannot share one 64-bit register, so that path only handles the
   fourth vector (VEC_SIZE * 3); otherwise it handles the third and
   fourth together, starting at VEC_SIZE * 2.  */
# if CHAR_PER_VEC == 64
#  define LAST_VEC_OFFSET	(VEC_SIZE * 3)
#  define TESTZ(reg)	incq %VGPR_SZ(reg, 64)
# else

#  if CHAR_PER_VEC == 32
#   define TESTZ(reg)	incl %VGPR_SZ(reg, 32)
#  elif CHAR_PER_VEC == 16
#   define TESTZ(reg)	incw %VGPR_SZ(reg, 16)
#  else
#   define TESTZ(reg)	incb %VGPR_SZ(reg, 8)
#  endif

#  define LAST_VEC_OFFSET	(VEC_SIZE * 2)
# endif

/* Vector register holding CHAR broadcast to every element.  */
# define VMATCH	VMM(0)

# define PAGE_SIZE	4096
93
/* STRCHR entry point.
   In:  rdi = string, esi = CHAR to search for.
   Out: rax = address of the first CHAR.  If the null terminator comes
	first, rax = 0 unless USE_AS_STRCHRNUL is defined, in which
	case rax = address of the terminator.
   This part checks the first VEC_SIZE bytes with one unaligned load
   and hands off to L(cross_page_boundary) if that load could cross
   a page.  eax keeps the in-page offset of rdi for that path.  */
	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (STRCHR, 6)
	/* Broadcast CHAR to VEC_0.  */
	VPBROADCAST	%esi, %VMATCH
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we cross page boundary with one vector load.
	   Otherwise it is safe to use an unaligned load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)


	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
	   null bytes.  */
	VMOVU	(%rdi), %VMM(1)
	/* Leaves only CHARS matching esi as 0.  */
	vpxorq	%VMM(1), %VMATCH, %VMM(2)
	VPMINU	%VMM(2), %VMM(1), %VMM(2)
	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRAX
# if VEC_SIZE == 64 && defined USE_AS_STRCHRNUL
	/* If VEC_SIZE == 64 && STRCHRNUL use bsf to test condition so
	   that all logic for match/null in the first VEC fits in 1x
	   cache line.  This has a slight cost to larger sizes.  */
	bsf	%VRAX, %VRAX
	jz	L(aligned_more)
# else
	test	%VRAX, %VRAX
	jz	L(aligned_more)
	bsf	%VRAX, %VRAX
# endif
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	/* NB: Use a branch instead of cmovcc here.  The expectation is
	   that with strchr the user will branch based on input being
	   null.  Since this branch will be 100% predictive of the user
	   branch a branch miss here should save what otherwise would
	   be branch miss in the user code.  Otherwise using a branch 1)
	   saves code size and 2) is faster in highly predictable
	   environments.  */
	jne	L(zero)
# endif
# ifdef USE_AS_WCSCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
	 */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	ret

# ifndef USE_AS_STRCHRNUL
	/* The null terminator came before any CHAR: return NULL.  */
L(zero):
	xorl	%eax, %eax
	ret
# endif
152
/* Return paths for a hit found through a mask in rcx.
   In:  rcx = non-zero mask of CHAR/null positions, rdi = aligned base.
   L(first_vec_x1) resolves against the vector at VEC_SIZE(%rdi);
   L(first_vec_x3) first advances rdi by 2 * VEC_SIZE so that the same
   code resolves against the vector two further on.  */
	.p2align 4,, 2
L(first_vec_x3):
	subq	$-(VEC_SIZE * 2), %rdi
# if VEC_SIZE == 32
	/* Reuse L(first_vec_x3) for last VEC2 only for VEC_SIZE == 32.
	   For VEC_SIZE == 64 the registers don't match.  */
L(last_vec_x2):
# endif
L(first_vec_x1):
	/* Use bsf here to save 1-byte keeping the block in 1x
	   fetch block.  ecx guaranteed non-zero.  */
	bsf	%VRCX, %VRCX
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG
	jne	L(zero)
# endif
	/* NB: Multiply sizeof char type (1 or 4) to get the number of
	   bytes.  */
	leaq	(VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %rax
	ret
174
/* Return paths for a hit reported through separate masks.
   In:  k0 = CHAR matches, k1 = null bytes (at least one bit set
	between them), rdi = aligned base.
   L(first_vec_x2) resolves against the vector at (VEC_SIZE * 2)(%rdi);
   L(first_vec_x4) first advances rdi by 2 * VEC_SIZE.  */
	.p2align 4,, 2
L(first_vec_x4):
	subq	$-(VEC_SIZE * 2), %rdi
L(first_vec_x2):
# ifndef USE_AS_STRCHRNUL
	/* Check to see if first match was CHAR (k0) or null (k1).  */
	KMOV	%k0, %VRAX
	tzcnt	%VRAX, %VRAX
	KMOV	%k1, %VRCX
	/* bzhi keeps only the null bits below the first CHAR match, so
	   the result will not be 0 if the first match was null.  */
	bzhi	%VRAX, %VRCX, %VRCX
	jne	L(zero)
# else
	/* Combine CHAR and null matches.  */
	KOR	%k0, %k1, %k0
	KMOV	%k0, %VRAX
	bsf	%VRAX, %VRAX
# endif
	/* NB: Multiply sizeof char type (1 or 4) to get the number of
	   bytes.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret
197
# ifdef USE_AS_STRCHRNUL
	/* We use this as a hook to get imm8 encoding for the jmp to
	   L(cross_page_boundary).  This allows the hot case of a
	   match/null-term in first VEC to fit entirely in 1 cache
	   line.  The real handler is L(cross_page_boundary_real).  */
L(cross_page_boundary):
	jmp	L(cross_page_boundary_real)
# endif
206
/* No hit in the first (possibly unaligned) vector.
   Rounds rdi down to a VEC_SIZE boundary and checks the next four
   aligned vectors one at a time.  Falls through to the 4x loop setup
   when none of them contains CHAR or a null byte.  */
	.p2align 4
L(aligned_more):
L(cross_page_continue):
	/* Align data to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rdi

	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  Use two alternating
	   methods for checking VEC to balance latency and port
	   contention.  */

	/* Method(1) with 8c latency:
	   For VEC_SIZE == 32:
	   p0 * 1.83, p1 * 0.83, p5 * 1.33
	   For VEC_SIZE == 64:
	   p0 * 2.50, p1 * 0.00, p5 * 1.50  */
	VMOVA	(VEC_SIZE)(%rdi), %VMM(1)
	/* Leaves only CHARS matching esi as 0.  */
	vpxorq	%VMM(1), %VMATCH, %VMM(2)
	VPMINU	%VMM(2), %VMM(1), %VMM(2)
	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(first_vec_x1)

	/* Method(2) with 6c latency:
	   For VEC_SIZE == 32:
	   p0 * 1.00, p1 * 0.00, p5 * 2.00
	   For VEC_SIZE == 64:
	   p0 * 1.00, p1 * 0.00, p5 * 2.00  */
	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(1)
	/* Each bit in K0 represents a CHAR in VEC_1.  */
	VPCMPEQ	%VMM(1), %VMATCH, %k0
	/* Each bit in K1 represents a null byte in VEC_1.  */
	VPTESTN	%VMM(1), %VMM(1), %k1
	KORTEST	%k0, %k1
	jnz	L(first_vec_x2)

	/* By swapping between Method 1/2 we get more fair port
	   distribution and better throughput.  */

	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(1)
	/* Leaves only CHARS matching esi as 0.  */
	vpxorq	%VMM(1), %VMATCH, %VMM(2)
	VPMINU	%VMM(2), %VMM(1), %VMM(2)
	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(first_vec_x3)

	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
	/* Each bit in K0 represents a CHAR in VEC_1.  */
	VPCMPEQ	%VMM(1), %VMATCH, %k0
	/* Each bit in K1 represents a null byte in VEC_1.  */
	VPTESTN	%VMM(1), %VMM(1), %k1
	KORTEST	%k0, %k1
	jnz	L(first_vec_x4)
266
/* Main loop: scan 4 * VEC_SIZE bytes per iteration.
   In:  rdi = VEC_SIZE aligned pointer; the four vectors after it have
	already been checked.
   On exit (fall through): LOOP_REG has already been advanced past the
   start of the block that contains a hit, VMM(1)..VMM(3) and k1..k3
   still describe that block, and rdx holds the incremented combined
   mask (non-zero).  */
	/* Align data to VEC_SIZE * 4 for the loop.  */
# if VEC_SIZE == 64
	/* Use rax for the loop reg as it allows the loop to fit in
	   exactly 2-cache-lines. (more efficient imm32 + gpr
	   encoding).  */
	leaq	(VEC_SIZE)(%rdi), %rax
	/* Clearing the low byte rounds rax down to VEC_SIZE * 4 (256).
	   No partial register stalls on evex512 processors.  */
	xorb	%al, %al
# else
	/* For VEC_SIZE == 32 continue using rdi for loop reg so we can
	   reuse more code and save space.  */
	addq	$VEC_SIZE, %rdi
	andq	$-(VEC_SIZE * 4), %rdi
# endif
	.p2align 4
L(loop_4x_vec):
	/* Check 4x VEC at a time.  No penalty for imm32 offset with evex
	   encoding (if offset % VEC_SIZE == 0).  */
	VMOVA	(VEC_SIZE * 4)(%LOOP_REG), %VMM(1)
	VMOVA	(VEC_SIZE * 5)(%LOOP_REG), %VMM(2)
	VMOVA	(VEC_SIZE * 6)(%LOOP_REG), %VMM(3)
	VMOVA	(VEC_SIZE * 7)(%LOOP_REG), %VMM(4)

	/* Collect bits where VEC_1 does NOT match esi.  This is later
	   used to mask off results (getting not matches allows us to
	   save an instruction on combining).  */
	VPCMP	$4, %VMATCH, %VMM(1), %k1

	/* Two methods for loop depending on VEC_SIZE.  This is because
	   with zmm registers VPMINU can only run on p0 (as opposed to
	   p0/p1 for ymm) so it is less preferred.  */
# if VEC_SIZE == 32
	/* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
	   zero.  */
	vpxorq	%VMM(2), %VMATCH, %VMM(6)
	vpxorq	%VMM(3), %VMATCH, %VMM(7)

	/* Find non-matches in VEC_4 while combining with non-matches
	   from VEC_1.  NB: Try and use masked predicate execution on
	   instructions that have mask result as it has no latency
	   penalty.  */
	VPCMP	$4, %VMATCH, %VMM(4), %k4{%k1}

	/* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
	VPMINU	%VMM(1), %VMM(2), %VMM(2)

	/* Use min to select all zeros (from either xor or end of
	   string).  */
	VPMINU	%VMM(3), %VMM(7), %VMM(3)
	VPMINU	%VMM(2), %VMM(6), %VMM(2)

	/* Combined zeros from VEC_3 / VEC_4 (search for null term).  */
	VPMINU	%VMM(3), %VMM(4), %VMM(4)

	/* Combined zeros from VEC_2 / VEC_4 (this has all null term and
	   esi matches for VEC_2 / VEC_3).  */
	VPMINU	%VMM(2), %VMM(4), %VMM(4)
# else
	/* Collect non-matches for VEC_2.  */
	VPCMP	$4, %VMM(2), %VMATCH, %k2

	/* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
	VPMINU	%VMM(1), %VMM(2), %VMM(2)

	/* Find non-matches in VEC_3/VEC_4 while combining with non-
	   matches from VEC_1/VEC_2 respectively.  */
	VPCMP	$4, %VMM(3), %VMATCH, %k3{%k1}
	VPCMP	$4, %VMM(4), %VMATCH, %k4{%k2}

	/* Finish combining zeros in all VECs.  */
	VPMINU	%VMM(3), %VMM(4), %VMM(4)

	/* Combine in esi matches for VEC_3 (if there was a match with
	   esi, the corresponding bit in %k3 is zero so the
	   VPMINU_MASKZ will have a zero in the result).  NB: This makes
	   the VPMINU 3c latency.  The only way to avoid it is to
	   create a 12c dependency chain on all the `VPCMP $4, ...`
	   which has higher total latency.  */
	VPMINU	%VMM(2), %VMM(4), %VMM(4){%k3}{z}
# endif
	VPTEST	%VMM(4), %VMM(4), %k0{%k4}
	KMOV	%k0, %VRDX
	subq	$-(VEC_SIZE * 4), %LOOP_REG

	/* TESTZ is inc using the proper register width depending on
	   CHAR_PER_VEC.  An esi match or null-term match leaves a zero-
	   bit in rdx so inc won't overflow and won't be zero.  */
	TESTZ	(rdx)
	jz	L(loop_4x_vec)
356
/* The loop found CHAR or a null byte somewhere in the last four
   vectors; find which vector and return.
   In:  LOOP_REG = start of the 4x block (already advanced by the
	loop), VMM(1)..VMM(3) / k1..k3 = per-vector state from the last
	iteration, rdx = incremented combined mask.
   After TESTZ a mask register is non-zero iff its vector had a hit,
   and its lowest set bit is the index of the first hit.  */
	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
	KMOV	%k0, %VGPR(MASK_GPR)
	TESTZ	(MASK_GPR)
# if VEC_SIZE == 32
	/* We can reuse the return code in page_cross logic for VEC_SIZE
	   == 32.  */
	jnz	L(last_vec_x1_vec_size32)
# else
	jnz	L(last_vec_x1_vec_size64)
# endif


	/* COND_MASK integrates the esi matches for VEC_SIZE == 64.  For
	   VEC_SIZE == 32 they are already integrated.  */
	VPTEST	%VMM(2), %VMM(2), %k0 COND_MASK(k2)
	KMOV	%k0, %VRCX
	TESTZ	(rcx)
	jnz	L(last_vec_x2)

	VPTEST	%VMM(3), %VMM(3), %k0 COND_MASK(k3)
	KMOV	%k0, %VRCX
# if CHAR_PER_VEC == 64
	TESTZ	(rcx)
	jnz	L(last_vec_x3)
# else
	/* Both remaining masks fit in one 64-bit register: move the
	   combined result above the VEC_3 bits and merge, so a single
	   bsf yields an index relative to VEC_3.  If VEC_3 had no hit
	   its incremented mask is zero and only the shifted bits
	   remain.  */
	salq	$CHAR_PER_VEC, %rdx
	TESTZ	(rcx)
	orq	%rcx, %rdx
# endif

	bsfq	%rdx, %rdx

# ifndef USE_AS_STRCHRNUL
	/* Check if match was CHAR or null.  */
	cmp	(LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %CHAR_REG
	jne	L(zero_end)
# endif
	/* NB: Multiply sizeof char type (1 or 4) to get the number of
	   bytes.  */
	leaq	(LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %rax
	ret

# ifndef USE_AS_STRCHRNUL
	/* The null terminator came before any CHAR: return NULL.  */
L(zero_end):
	xorl	%eax, %eax
	ret
# endif
404
405
/* Return stubs used after the 4x loop when VEC_SIZE == 64.
   In:  the mask register used by each stub is non-zero and has
	already been incremented by TESTZ; LOOP_REG (rax) = start of
	the 4x block.  */
	/* Separate return label for last VEC1 because for VEC_SIZE ==
	   32 we can reuse return code in L(page_cross) but VEC_SIZE ==
	   64 has mismatched registers.  */
# if VEC_SIZE == 64
	.p2align 4,, 8
L(last_vec_x1_vec_size64):
	bsf	%VRCX, %VRCX
#  ifndef USE_AS_STRCHRNUL
	/* Check if match was null.  */
	cmp	(%rax, %rcx, CHAR_SIZE), %CHAR_REG
	jne	L(zero_end)
#  endif
#  ifdef USE_AS_WCSCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
	 */
	leaq	(%rax, %rcx, CHAR_SIZE), %rax
#  else
	addq	%rcx, %rax
#  endif
	ret

	/* Since we can't combine the last 2x matches for CHAR_PER_VEC
	   == 64 we need return label for last VEC3.  It advances
	   LOOP_REG by one vector and then shares the VEC2 return
	   code.  */
#  if CHAR_PER_VEC == 64
	.p2align 4,, 8
L(last_vec_x3):
	addq	$VEC_SIZE, %LOOP_REG
#  endif

	/* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't
	   reuse L(first_vec_x3) due to register mismatch.  */
L(last_vec_x2):
	bsf	%VGPR(MASK_GPR), %VGPR(MASK_GPR)
#  ifndef USE_AS_STRCHRNUL
	/* Check if match was null.  */
	cmp	(VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG
	jne	L(zero_end)
#  endif
	/* NB: Multiply sizeof char type (1 or 4) to get the number of
	   bytes.  */
	leaq	(VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %rax
	ret
# endif
449
/* Cold case for crossing page with first load.
   In:  rdi = original string pointer, eax = rdi & (PAGE_SIZE - 1).
   Loads the last aligned vector of the page that contains rdi and
   shifts out the mask bits for CHARs that precede rdi.  Jumps to
   L(cross_page_continue) if nothing was found; otherwise falls
   through into the return code that follows, with the hit index
   (relative to rdi) encoded in eax.  */
	.p2align 4,, 10
# ifndef USE_AS_STRCHRNUL
L(cross_page_boundary):
# endif
L(cross_page_boundary_real):
	/* Clear the page-offset bits: rax = start of the page that
	   contains rdi.  */
	xorq	%rdi, %rax
	VMOVA	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
	/* Use high latency method of getting matches to save code size.
	 */

	/* K1 has 1s where VEC(1) does NOT match esi.  */
	VPCMP	$4, %VMM(1), %VMATCH, %k1
	/* K0 has ones where K1 is 1 (non-match with esi) and the CHAR
	   is non-zero (not null).  */
	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
	KMOV	%k0, %VRAX
	/* Remove the leading bits.  */
# ifdef USE_AS_WCSCHR
	movl	%edi, %VGPR_SZ(SHIFT_REG, 32)
	/* NB: Divide shift count by 4 since each bit in K1 represent 4
	   bytes.  */
	sarl	$2, %VGPR_SZ(SHIFT_REG, 32)
	andl	$(CHAR_PER_VEC - 1), %VGPR_SZ(SHIFT_REG, 32)

	/* If wcschr we need to reverse matches as we can't rely on
	   signed shift to bring in ones.  There is no sarx for
	   gpr8/16.  Also note we can't use inc here as the lower bits
	   represent matches out of range so we can't rely on overflow.
	 */
	xorl	$((1 << CHAR_PER_VEC)- 1), %eax
# endif
	/* Use arithmetic shift so that leading 1s are filled in.  */
	sarx	%VGPR(SHIFT_REG), %VRAX, %VRAX
	/* If eax is all ones then no matches for esi or NULL.  (For
	   wcschr the mask was inverted above, so all zeros means no
	   matches.)  */

# ifdef USE_AS_WCSCHR
	test	%VRAX, %VRAX
# else
	inc	%VRAX
# endif
	jz	L(cross_page_continue)
493
/* Shared return code.
   Reached by fall-through from the page-cross path and, for
   VEC_SIZE == 32, by a jump from the 4x loop for a hit in its first
   vector.
   In:  eax = non-zero value whose lowest set bit is the index of the
	first hit, rdi = address that index is relative to.  */
	.p2align 4,, 10
L(last_vec_x1_vec_size32):
	bsf	%VRAX, %VRAX
# ifdef USE_AS_WCSCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
	 */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
# ifndef USE_AS_STRCHRNUL
	/* Check to see if match was CHAR or null.  */
	cmp	(%rax), %CHAR_REG
	jne	L(zero_end_0)
# endif
	ret
# ifndef USE_AS_STRCHRNUL
	/* The null terminator came before any CHAR: return NULL.  */
L(zero_end_0):
	xorl	%eax, %eax
	ret
# endif

END (STRCHR)
517	#endif
518

source code of glibc/sysdeps/x86_64/multiarch/strchr-evex.S