/* {wcs|str}ncat with 256/512-bit EVEX.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

/* Use evex-masked stores for small sizes.  Turned off at the
   moment.  */
# define USE_EVEX_MASKED_STORE 0

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifndef STRNCAT
#  define STRNCAT __strncat_evex
# endif


# ifdef USE_AS_WCSCPY
#  define MOVCHAR movl
#  define VMOVU_MASK vmovdqu32
#  define VPMIN vpminud
#  define VPTESTN vptestnmd
#  define VPTEST vptestmd
#  define VPCMPEQ vpcmpeqd
#  define CHAR_SIZE 4

#  define REP_MOVS rep movsd

#  define VMASK_REG VR10
#  define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst

#  define USE_WIDE_CHAR
# else
#  define MOVCHAR movb
#  define VMOVU_MASK vmovdqu8
#  define VPMIN vpminub
#  define VPTESTN vptestnmb
#  define VPTEST vptestmb
#  define VPCMPEQ vpcmpeqb
#  define CHAR_SIZE 1

#  define REP_MOVS rep movsb

#  define VMASK_REG VRCX
#  define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst

# endif
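
/* With the definitions above the body below is written purely in
   terms of CHARs: for wcsncat (USE_AS_WCSCPY) a CHAR is a 4-byte
   wchar_t handled with the dword forms of the compare/min/test
   instructions, while for strncat a CHAR is a single byte.  */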

# include "strncpy-or-cat-overflow-def.h"

# include "reg-macros.h"


# define VZERO VMM(7)
# define VZERO_128 VMM_128(7)

# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCAT)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif

	movq %rdi, %rax

	/* NB: It's safe to filter out zero-length calls WITHOUT
	   writing a null terminator.  The destination must already be
	   a null-terminated string, so the work is essentially
	   already done.  */
# ifdef USE_AS_WCSCPY
	leaq -1(%rdx), %rcx
	shrq $56, %rcx
	jnz L(zero_len)
# else
	test %rdx, %rdx
	jle L(zero_len)
# endif

# include "strcat-strlen-evex.h.S"
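	/* strcat-strlen-evex.h.S performs an inlined strlen on the
	   destination and leaves %rdi pointing at dst's null
	   terminator (%rsi and %rdx are preserved), so the code below
	   is effectively a bounded strcpy to that position.  */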

	movl %esi, %ecx
	andl $(PAGE_SIZE - 1), %ecx
	cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
	ja L(page_cross)
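
	/* The check above branches to the page-cross path whenever src
	   is within VEC_SIZE bytes of the end of its page, i.e.
	   whenever the full unaligned VEC load below could fault.  */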
L(page_cross_continue):
	VMOVU (%rsi), %VMM(0)
	VPTESTN %VMM(0), %VMM(0), %k0

	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
	   lengths <= CHAR_PER_VEC with masked instructions (which have
	   potential for dramatically bad perf if dst splits a page and
	   is not in the TLB).  */
# if USE_EVEX_MASKED_STORE
	KMOV %k0, %VRCX
	FIND_FIRST_ONE (VRCX, VR8)
	cmpq %r8, %rdx
	jbe L(less_1x_vec)

	test %VRCX, %VRCX
	jz L(more_1x_vec)

	blsmsk %VRCX, %VRCX
	KMOV %VRCX, %k1
	VMOVU_MASK %VMM(0), (%rdi){%k1}
	ret

L(less_1x_vec):
	mov $-1, %VRCX
	bzhi %VRDX, %VRCX, %VRCX
	KMOV %VRCX, %k1
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	VMOVU_MASK %VMM(0), (%rdi){%k1}

	ret
# else
	KMOV %k0, %VMASK_REG
	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
	   %VMASK_REG, %VRCX` for wcsncat.  */
	FIND_FIRST_ONE (VMASK_REG, VRCX)
	cmpq %rcx, %rdx
	jbe L(less_1x_vec)

	/* If there were no zero CHARs (the mask was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
	cmpl $CHAR_PER_VEC, %ecx
	je L(more_1x_vec)

	movl %ecx, %edx

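	/* Here %rdx holds the exact number of CHARs to copy: either
	   the caller's n or the index of src's first null CHAR,
	   whichever is smaller.  Copy with (possibly overlapping)
	   loads/stores chosen by size bucket and store the null
	   terminator at %rdi + %rdx * CHAR_SIZE.  */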
L(less_1x_vec):
#  if VEC_SIZE == 64
	cmpl $(32 / CHAR_SIZE), %edx
	jae L(copy_32_63)
#  endif

	cmpl $(16 / CHAR_SIZE), %edx
	jae L(copy_16_31)


	cmpl $(8 / CHAR_SIZE), %edx
	jae L(copy_8_15)

#  ifdef USE_AS_WCSCPY
	vmovd %VMM_128(0), (%rdi)
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	ret
#  else

	cmpl $4, %edx
	jae L(copy_4_7)

	movzbl (%rsi), %ecx
	cmpl $1, %edx
	jbe L(set_null_term)

	movzwl 1(%rsi), %esi
	movw %si, 1(%rdi)

	.p2align 4,, 1
L(set_null_term):
	movb %cl, (%rdi)
	MOVCHAR $0, (%rdi, %rdx)
	ret
#  endif

#  if VEC_SIZE == 64
	.p2align 4,, 6
L(copy_32_63):
	VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU %VMM_256(0), (%rdi)
	VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	ret
#  endif
	.p2align 4,, 6
L(copy_16_31):
	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
	   and will save code size.  */
	vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU %VMM_128(0), (%rdi)
	vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	ret

	.p2align 4,, 2
L(copy_8_15):
	movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
	vmovq %VMM_128(0), (%rdi)
	movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	ret

#  ifndef USE_AS_WCSCPY
	.p2align 4,, 12
L(copy_4_7):
	movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
	vmovd %VMM_128(0), (%rdi)
	movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	ret
#  endif

# endif
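
	/* L(zero_len) is reached for n == 0 and for counts so large
	   they were filtered out at entry.  A count that large can
	   never limit the copy, so those cases branch to the unbounded
	   {wcs|str}cat via OVERFLOW_STRCAT, while n == 0 simply
	   returns.  */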
	.p2align 4,, 4
L(zero_len):
# ifdef USE_AS_WCSCPY
	test %rdx, %rdx
# endif
	jne OVERFLOW_STRCAT
	ret

	.p2align 4,, 8
L(more_1x_vec):
	VMOVU %VMM(0), (%rdi)

	/* We are going to align rsi here, so we will need to be able
	   to re-adjust rdi/rdx afterwards.  NB: We filtered out huge
	   lengths, so rsi + rdx * CHAR_SIZE cannot overflow.  */

	leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
	subq %rsi, %rdi
	andq $-(VEC_SIZE), %rsi
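
	/* Here %rdx points one VEC_SIZE before the end of the n-CHAR
	   source window, %rdi holds only the dst - src displacement,
	   and %rsi has been rounded down to a VEC_SIZE boundary.  The
	   add/sub at L(loop_last_4x_vec) adjusts %rdi so equal offsets
	   from %rsi and %rdi again address corresponding CHARs and
	   turns %rdx back into a CHAR count measured from the aligned
	   %rsi.  */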
L(loop_last_4x_vec):
	addq %rsi, %rdi
	subq %rsi, %rdx
# ifdef USE_AS_WCSCPY
	shrq $2, %rdx
# endif

	/* Will need this regardless.  */
	VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VMASK_REG

	cmpq $(CHAR_PER_VEC * 2), %rdx
	ja L(more_2x_vec)

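	/* At this point %rdx (the CHAR count measured from the aligned
	   %rsi to one VEC_SIZE before the end of the copy window) is
	   at most 2 * CHAR_PER_VEC, and src VEC 1 has already been
	   loaded and tested in %VMM(1)/VMASK_REG.  At most two more
	   vector stores plus the null terminator are needed.  */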
L(last_2x_vec):
	FIND_FIRST_ONE (VMASK_REG, VRCX)
	cmpl %ecx, %edx
	jbe L(ret_vec_x1_len)

	/* If there were no zero CHARs (the mask was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
	cmpl $CHAR_PER_VEC, %ecx
	jne L(ret_vec_x1)

	VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	addl $-CHAR_PER_VEC, %edx
	bzhi %VRDX, %VRCX, %VR8
	jz L(ret_vec_x2_len)
L(ret_vec_x2):
	bsf %VRCX, %VRDX
L(ret_vec_x2_len):
	VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
	VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
	ret

	.p2align 4,, 4
L(ret_vec_x1_len):
	movl %edx, %ecx
L(ret_vec_x1):
	VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	MOVCHAR $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
	VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
	VZEROUPPER_RETURN


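	/* L(last_4x_vec): between 4 * CHAR_PER_VEC and 8 * CHAR_PER_VEC
	   CHARs of the bound remain.  Advance src/dst by 4 * VEC_SIZE,
	   drop 4 * CHAR_PER_VEC from the count, then finish in
	   L(last_2x_vec) or fall through into L(more_2x_vec).  */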
	.p2align 4,, 8
L(last_4x_vec):
	addl $-(CHAR_PER_VEC * 4), %edx
	VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VMASK_REG
	subq $-(VEC_SIZE * 4), %rsi
	subq $-(VEC_SIZE * 4), %rdi
	cmpl $(CHAR_PER_VEC * 2), %edx
	jbe L(last_2x_vec)
	.p2align 4,, 8
L(more_2x_vec):
# ifdef USE_AS_WCSCPY
	xorl %ecx, %ecx
# endif
	bsf %VMASK_REG, %VRCX
	jnz L(ret_vec_x1)

	VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x2)

	VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
	VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
	VPTESTN %VMM(3), %VMM(3), %k0
	KMOV %k0, %VMASK_REG

	cmpq $(CHAR_PER_VEC * 4), %rdx
	ja L(more_4x_vec)

	/* Adjust length before going to L(ret_vec_x3_len) or
	   L(ret_vec_x3).  */
	addl $(CHAR_PER_VEC * -2), %edx

	FIND_FIRST_ONE (VMASK_REG, VRCX)
	cmpl %ecx, %edx
	jbe L(ret_vec_x3_len)

	/* If there were no zero CHARs (the mask was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
	cmpl $CHAR_PER_VEC, %ecx
	jne L(ret_vec_x3)

	VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	VPTESTN %VMM(4), %VMM(4), %k0
	KMOV %k0, %VRCX
	addl $-CHAR_PER_VEC, %edx
	bzhi %VRDX, %VRCX, %VR8
	jz L(ret_vec_x4_len)
L(ret_vec_x4):
	bsf %VRCX, %VRDX
L(ret_vec_x4_len):
	VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	MOVCHAR $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
	VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
	ret

	.p2align 4,, 4
L(ret_vec_x3_len):
	movl %edx, %ecx
L(ret_vec_x3):
	VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
	VMOVU %VMM(0), (VEC_SIZE * 3 -(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
	ret

	.p2align 4,, 8
L(more_4x_vec):
# ifdef USE_AS_WCSCPY
	xorl %ecx, %ecx
# endif
	bsf %VMASK_REG, %VRCX
	jnz L(ret_vec_x3)

	VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	VPTESTN %VMM(4), %VMM(4), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x4)

	VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)

	/* Check if we are near the end before aligning.  */
	cmpq $(CHAR_PER_VEC * 8), %rdx
	jbe L(last_4x_vec)


	/* Add rsi to rdx (length) before aligning rsi.  NB: Since we
	   filtered out huge lengths this cannot overflow.  */
# ifdef USE_AS_WCSCPY
	leaq (%rsi, %rdx, CHAR_SIZE), %rdx
# else
	addq %rsi, %rdx
# endif

	/* Subtract rsi from rdi before aligning (add back will have
	   correct rdi for aligned rsi).  */
	subq %rsi, %rdi
	subq $-(VEC_SIZE * 5), %rsi
	andq $(VEC_SIZE * -4), %rsi

	/* Load first half of the loop before entry.  */
	VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4

	/* Offset rsi by VEC_SIZE so that we can jump to
	   L(loop_last_4x_vec).  */
	addq $-(VEC_SIZE), %rsi
	KORTEST %k2, %k4
	jnz L(loop_4x_done)

	/* Store loop end in r9.  */
	leaq -(VEC_SIZE * 5)(%rdx), %r9

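	/* Main 4x unrolled loop: store the four VECs checked on the
	   previous iteration, then load the next four and fold their
	   zero-CHAR tests together with two VPMIN and a KORTEST.  Exit
	   either when a null CHAR is seen (L(loop_4x_done)) or when
	   the bound leaves fewer than four VECs
	   (L(loop_last_4x_vec)).  */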
	.p2align 4,, 11
L(loop_4x_vec):
	VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
	VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)

	subq $(VEC_SIZE * -4), %rsi
	cmpq %rsi, %r9
	jbe L(loop_last_4x_vec)

	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)

	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4
	KORTEST %k2, %k4
	jz L(loop_4x_vec)

L(loop_4x_done):
	VPTESTN %VMM(0), %VMM(0), %k0
	KMOV %k0, %VRCX
	/* Restore rdi (dst).  */
	addq %rsi, %rdi

	/* L(ret_vec_x1) expects rcx to hold the position of the zero
	   CHAR, so find it with bsf (which also sets ZF for the
	   branch).  */
	bsf %VRCX, %VRCX
	jnz L(ret_vec_x1)
	VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)

	KMOV %k2, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x2)
	VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)

	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	bsf %VRCX, %VRCX
	jnz L(ret_vec_x3)
	VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)

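	/* No null CHAR in VEC(0)-VEC(2), so it must be in VEC(3).
	   Since VEC(2) is null-free, the set bits of %k4 (from the
	   VPMIN of VEC(2)/VEC(3)) mark exactly the null CHARs of
	   VEC(3).  Copy one final overlapping VEC whose last CHAR is
	   the null terminator.  */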
	KMOV %k4, %VRCX
	bsf %VRCX, %VRCX
	VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
	ret


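	/* Page-cross handling: a full VEC load at src might fault, so
	   instead load the aligned VEC containing src, build the
	   zero-CHAR mask from it, and shift the mask right by src's
	   misalignment so that bit 0 corresponds to the first source
	   CHAR.  */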
	.p2align 4,, 4
L(page_cross):
	movq %rsi, %r8
	andq $(VEC_SIZE * -1), %r8
	VPCMPEQ (%r8), %VZERO, %k0

# ifdef USE_AS_WCSCPY
	KMOV %k0, %VR9
	shrl $2, %ecx
	andl $(CHAR_PER_VEC - 1), %ecx
	shrx %VRCX, %VR9, %VRCX
# else
	KMOV %k0, %VRCX
	shrx %VRSI, %VRCX, %VRCX
# endif

	subl %esi, %r8d
	andl $(VEC_SIZE - 1), %r8d
# ifdef USE_AS_WCSCPY
	shrl $2, %r8d
# endif
	cmpq %r8, %rdx
	jbe L(page_cross_small)
	/* Optimize more for space here, as this is very cold code.
	   This saves two cache lines.  */

	/* This adds one to the later bsf result, which gives the
	   correct copy bound (the null terminator is included in the
	   copy).  NB: this can never zero out a non-zero RCX because,
	   to be in the page-cross case, rsi cannot be VEC_SIZE
	   aligned, and rcx has already been right-shifted by the
	   misalignment.  */
	shl %VRCX
	jz L(page_cross_continue)
	bsf %VRCX, %VRCX
	REP_MOVS
	ret

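	/* n is no larger than the number of CHARs left on src's page:
	   copy min (index of first null CHAR, n) CHARs with rep movs
	   and write the null terminator, never touching the next
	   page.  */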
L(page_cross_small):
	tzcnt %VRCX, %VRCX
	jz L(page_cross_setz)
	cmpl %edx, %ecx
	cmova %edx, %ecx

# ifdef USE_AS_WCSCPY
	rep movsd
# else
	rep movsb
# endif
L(page_cross_setz):
	MOVCHAR $0, (%rdi)
	ret
END(STRNCAT)
#endif

/* Source: glibc/sysdeps/x86_64/multiarch/strncat-evex.S.  */