/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

	/* Use evex-masked stores for small sizes.  Turned off at the
	   moment.  */
# define USE_EVEX_MASKED_STORE	0


# include <sysdep.h>
# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif


# ifndef STRNCPY
#  define STRNCPY	__strncpy_evex
# endif

# ifdef USE_AS_WCSCPY
#  define VMOVU_MASK	vmovdqu32
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define VPTESTN	vptestnmd
#  define VPTEST	vptestmd
#  define CHAR_SIZE	4

#  define REP_MOVS	rep movsd
#  define REP_STOS	rep stosl

#  define USE_WIDE_CHAR

# else
#  define VMOVU_MASK	vmovdqu8
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define VPTESTN	vptestnmb
#  define VPTEST	vptestmb
#  define CHAR_SIZE	1

#  define REP_MOVS	rep movsb
#  define REP_STOS	rep stosb
# endif
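	/* NB: The macros above pick the element width (dword for the
	   wide-character variants, byte otherwise), so the single body
	   below implements {wcs|wcp}ncpy as well as {str|stp}ncpy.
	   REP_MOVS / REP_STOS are the matching string instructions
	   used on the page-cross and huge-length fallback paths.  */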

# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# include "reg-macros.h"


# define VZERO	VMM(7)
# define VZERO_256	VMM_256(7)
# define VZERO_128	VMM_128(7)

# if VEC_SIZE == 64
#  define VZERO_HALF	VZERO_256
# else
#  define VZERO_HALF	VZERO_128
# endif
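	/* Rough flow of the implementation below: copy forward in
	   VEC_SIZE blocks (4x VEC per iteration in the main loop)
	   until either the null terminator or the length limit is
	   reached.  Every path that finds the terminator before the
	   limit falls into one of the L(zfill*) routines, which zero
	   the rest of the buffer as strncpy requires.  Lengths below
	   one VEC are handled by size-bucketed copies (or by EVEX
	   masked stores when USE_EVEX_MASKED_STORE is enabled), and a
	   source close to a page boundary takes L(page_cross).  */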

	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCPY)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	/* Filter out zero-length and very long strings.  Zero-length
	   strings just return.  Very long strings are handled by
	   running rep stos{b|l} to zero out the destination (which
	   will almost certainly segfault); if that somehow succeeds,
	   finish by calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy,
	   wcpcpy).  */
# ifdef USE_AS_WCSCPY
	decq	%rdx
	movq	%rdx, %rax
	/* 56 is the end of the max supported address space.  */
	shr	$56, %rax
	jnz	L(zero_len)
# else
	decq	%rdx
	/* If the flag needs to become `jb` replace `dec` with `sub`.
	 */
	jl	L(zero_len)
# endif
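	/* NB: From here on rdx holds length - 1 (in CHARs).  The
	   compare constants below and the `adc`-based return values
	   for STPCPY all account for this bias.  */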

	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
	movl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
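	/* If the page offset of rsi is above PAGE_SIZE - VEC_SIZE, an
	   unaligned VEC-sized load from rsi could cross into the next
	   page and fault even though the string itself ends earlier,
	   so such sources take L(page_cross), which only reads from
	   the current page.  */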
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(page_cross)

L(page_cross_continue):
	VMOVU	(%rsi), %VMM(0)
	VPTESTN	%VMM(0), %VMM(0), %k0
	KMOV	%k0, %VRCX

	/* If not STPCPY, the return value is just dst, so save it
	   ahead of time.  */
# ifndef USE_AS_STPCPY
	movq	%rdi, %rax
# endif


	cmpq	$(CHAR_PER_VEC), %rdx

	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
	   lengths <= CHAR_PER_VEC with masked instructions (which
	   have potential for dramatically bad perf if dst splits a
	   page and is not in the TLB).  */
# if USE_EVEX_MASKED_STORE
	/* `jae` because length rdx is now length - 1.  */
	jae	L(more_1x_vec)

	/* If there were multiple zero-CHAR matches in the first VEC,
	   VRCX will be overset, but that is fine since any overset
	   bits are at zero positions anyway.  */

#  ifdef USE_AS_STPCPY
	tzcnt	%VRCX, %VRAX
	cmpl	%eax, %edx
	cmovb	%edx, %eax
#   ifdef USE_AS_WCSCPY
	adcl	$0, %eax
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
#   else
	adcq	%rdi, %rax
#   endif
#  endif
	dec	%VRCX

	/* Zero out all non-zero CHARs after the first zero match.  */
	KMOV	%VRCX, %k1

	/* Use VZERO as the destination so this can be reused for
	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
	   will have zeroed out VZERO).  */
	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
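	/* If the first VEC contains a null CHAR, VZERO now holds the
	   source CHARs below it and zeros from it onwards, so the
	   single masked store in L(zfill_less_vec) performs the copy
	   and the required zero-fill in one go.  */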
L(zfill_less_vec):
	/* Get mask for what we need to set.  */
	incl	%edx
	mov	$-1, %VRCX
	bzhi	%VRDX, %VRCX, %VRCX
	KMOV	%VRCX, %k1
	VMOVU_MASK %VZERO, (%rdi){%k1}
	ret

	.p2align 4,, 4
L(zero_len):
	cmpq	$-1, %rdx
	jne	L(best_effort_strncpy)
	movq	%rdi, %rax
	ret

	.p2align 4,, 8
L(more_1x_vec):
# else
	/* `jb` because length rdx is now length - 1.  */
	jb	L(less_1x_vec)
# endif


	/* This may copy CHARs past the null terminator, but that's
	   fine because we still need to zero fill.  */
	VMOVU	%VMM(0), (%rdi)


	/* Length must be >= CHAR_PER_VEC so a match here means we
	   must zero-fill.  */
	test	%VRCX, %VRCX
	jnz	L(zfill)


	/* We are going to align rsi here so we will need to be able
	   to re-adjust rdi/rdx afterwards.  NB: We filtered out huge
	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
	subq	%rsi, %rdi
	andq	$-(VEC_SIZE), %rsi
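	/* At this point rdx holds the address VEC_SIZE below the last
	   CHAR of the copy and rdi holds dst - src; L(loop_last_4x_vec)
	   turns them back into an adjusted destination pointer and a
	   remaining CHAR count (relative to (VEC_SIZE * 1)(%rsi) and
	   still carrying the -1 bias).  */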

L(loop_last_4x_vec):
	addq	%rsi, %rdi
	subq	%rsi, %rdx
# ifdef USE_AS_WCSCPY
	shrq	$2, %rdx
# endif

	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VRCX

	/* -1 because of the `dec %rdx` earlier.  */
	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
	ja	L(more_2x_vec)

L(last_2x_vec):
	/* This will need to be computed no matter what.  We do it
	   ahead of time for CHAR_PER_VEC == 64 because we can't
	   adjust the value of `tzcnt` with a shift.  */
# if CHAR_PER_VEC == 64
	tzcntq	%rcx, %rcx
# endif

	cmpl	$(CHAR_PER_VEC), %edx
	jb	L(ret_vec_x1_len)

	/* Separate logic for CHAR_PER_VEC == 64 because we already
	   did `tzcnt` on VRCX.  */
# if CHAR_PER_VEC == 64
	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
	cmpb	$CHAR_PER_VEC, %cl
	jnz	L(ret_vec_x1_no_bsf)
# else
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x1)
# endif



	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	KMOV	%k0, %VRCX

# if CHAR_PER_VEC < 64
	/* This essentially adds CHAR_PER_VEC to the result the later
	   `tzcnt` computes.  */
	shlq	$CHAR_PER_VEC, %rcx
# else
	tzcntq	%rcx, %rcx
	addl	$CHAR_PER_VEC, %ecx
# endif

	.p2align 4,, 4
L(ret_vec_x1_len):
	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it
	   has already been done.  */
# if CHAR_PER_VEC < 64
	tzcntq	%rcx, %rcx
# endif
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x1_len_no_zfill)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x1_len_no_zfill_mov):
	movl	%ecx, %edx
# ifdef USE_AS_STPCPY
	/* clear flags.  */
	xorl	%ecx, %ecx
# endif
L(ret_vec_x1_len_no_zfill):
	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
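	/* For STPCPY the adc below folds in the carry left by the
	   preceding cmpl %ecx, %edx (cleared by the xorl on the
	   _no_zfill_mov path): the extra 1 is added exactly when the
	   length limit ran out before the null terminator, so the
	   returned end pointer matches stpncpy semantics.  */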
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
#  else
	leal	(VEC_SIZE)(%rdx), %eax
	adcq	%rdi, %rax
#  endif
# endif
	ret


	.p2align 4,, 10
L(ret_vec_x1):
	bsf	%VRCX, %VRCX
L(ret_vec_x1_no_bsf):
	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
	subl	%ecx, %edx
	cmpl	$CHAR_PER_VEC, %edx
	jb	L(ret_vec_x1_len_no_zfill_mov)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

	.p2align 4,, 8
L(last_4x_vec):
	/* Separate logic for CHAR_PER_VEC == 64 because the `andl
	   $(CHAR_PER_VEC * 4 - 1), %edx` can be done with less code
	   size by just using `movzbl`.  */
# if CHAR_PER_VEC == 64
	movzbl	%dl, %edx
# else
	andl	$(CHAR_PER_VEC * 4 - 1), %edx
# endif
	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VRCX
	subq	$-(VEC_SIZE * 4), %rsi
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
	jbe	L(last_2x_vec)
	.p2align 4,, 8
L(more_2x_vec):
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	test	%VRCX, %VRCX
	/* Must fill at least 2x VEC.  */
	jnz	L(zfill_vec1)

	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	/* Must fill at least 1x VEC.  */
	jnz	L(zfill_vec2)

	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
	VPTESTN	%VMM(3), %VMM(3), %k0
	KMOV	%k0, %VRCX

	/* Check if len is more than 4x VEC.  -1 because rdx is
	   len - 1.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
	ja	L(more_4x_vec)

	subl	$(CHAR_PER_VEC * 3), %edx
	jb	L(ret_vec_x3_len)

	test	%VRCX, %VRCX
	jnz	L(ret_vec_x3)

	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	KMOV	%k0, %VRCX
	tzcnt	%VRCX, %VRCX
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x4_len_no_zfill)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
	movl	%ecx, %edx
L(ret_vec_x4_len_no_zfill):
	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
#  else
	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
	adcq	%rdi, %rax
#  endif
# endif
	ret


L(ret_vec_x3_len):
	addl	$(CHAR_PER_VEC * 1), %edx
	tzcnt	%VRCX, %VRCX
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x3_len_no_zfill)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x3_len_no_zfill_mov):
	movl	%ecx, %edx
# ifdef USE_AS_STPCPY
	/* clear flags.  */
	xorl	%ecx, %ecx
# endif
	.p2align 4,, 4
L(ret_vec_x3_len_no_zfill):
	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
#  else
	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
	adcq	%rdi, %rax
#  endif
# endif
	ret


	.p2align 4,, 8
L(ret_vec_x3):
	bsf	%VRCX, %VRCX
	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
	subl	%ecx, %edx
	jl	L(ret_vec_x3_len_no_zfill_mov)
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

	.p2align 4,, 8
L(more_4x_vec):
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	test	%VRCX, %VRCX
	jnz	L(zfill_vec3)

	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
	VPTESTN	%VMM(4), %VMM(4), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec4)

	/* Recheck length before aligning.  */
	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
	jbe	L(last_4x_vec)

	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
# ifdef USE_AS_WCSCPY
	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
# else
	addq	%rsi, %rdx
# endif
	subq	%rsi, %rdi
	subq	$-(VEC_SIZE * 5), %rsi
	andq	$(VEC_SIZE * -4), %rsi
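	/* rdi now holds dst - src (so destinations are addressed as
	   (...)(%rdi, %rsi)), rdx holds the address VEC_SIZE below the
	   last CHAR of the copy, and rsi is (src + VEC_SIZE * 5)
	   rounded down to a multiple of VEC_SIZE * 4 so the VMOVA
	   loads below are aligned.  */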


	/* Load first half of the loop before entry.  */
	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPTESTN	%VMM(4), %VMM(4), %k2
	VPTESTN	%VMM(6), %VMM(6), %k4


	/* Offset rsi by VEC_SIZE so that we can jump to
	   L(loop_last_4x_vec).  */
	addq	$-(VEC_SIZE), %rsi
	KORTEST	%k2, %k4
	jnz	L(loop_4x_done)

	/* Store loop end in r9.  */
	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
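	/* r9 sits VEC_SIZE * 6 below the end of the region to copy.
	   The main loop keeps iterating while r9 is above the
	   already-advanced rsi; once it is not, the remaining tail is
	   small enough for L(loop_last_4x_vec) and the paths after it
	   to finish.  */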

	.p2align 4,, 11
L(loop_4x_vec):
	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)

	subq	$(VEC_SIZE * -4), %rsi
	cmpq	%rsi, %r9
	jbe	L(loop_last_4x_vec)

	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPTESTN	%VMM(4), %VMM(4), %k2
	VPTESTN	%VMM(6), %VMM(6), %k4
	KORTEST	%k2, %k4
	jz	L(loop_4x_vec)

L(loop_4x_done):
	/* Restore rdx (length).  */
	subq	%rsi, %rdx
# ifdef USE_AS_WCSCPY
	shrq	$2, %rdx
# endif
	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	/* Restore rdi (dst).  */
	addq	%rsi, %rdi
	VPTESTN	%VMM(0), %VMM(0), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec1)

	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
	KMOV	%k2, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec2)

	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec3)

	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
	KMOV	%k4, %VRCX
	// Zfill more....
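	/* Fall through: VMM(0), VMM(1) and VMM(2) were all checked
	   above, so the null CHAR that stopped the loop is in VMM(3)
	   and k4 (nulls of VPMIN(VMM(2), VMM(3))) is the non-zero
	   mask L(zfill_vec4) expects in VRCX.  */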

	.p2align 4,, 4
L(zfill_vec4):
	subq	$(VEC_SIZE * -2), %rdi
	addq	$(CHAR_PER_VEC * -2), %rdx
L(zfill_vec2):
	subq	$(VEC_SIZE * -2), %rdi
	addq	$(CHAR_PER_VEC * -1), %rdx
L(zfill):
	/* VRCX must be non-zero.  */
	bsf	%VRCX, %VRCX

	/* Adjust length / dst for zfill.  */
	subq	%rcx, %rdx
# ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
# else
	addq	%rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
L(zfill_from_page_cross):

	/* From here on out it's just memset(rdi, 0, rdx).  */
	cmpq	$CHAR_PER_VEC, %rdx
	jb	L(zfill_less_vec)

L(zfill_more_1x_vec):
	VMOVU	%VZERO, (%rdi)
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
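	/* The store at (%rdi) and the store ending at the last CHAR
	   to be zeroed overlap, so together they complete any fill of
	   at most 2x VEC; larger fills continue at
	   L(zfill_more_2x_vec).  */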
	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
	ja	L(zfill_more_2x_vec)
L(zfill_done0):
	ret

	/* Coming from vec1/vec2 we must be able to zfill at least 2x
	   VEC.  */
	.p2align 4,, 8
L(zfill_vec3):
	subq	$(VEC_SIZE * -2), %rdi
	addq	$(CHAR_PER_VEC * -2), %rdx
	.p2align 4,, 2
L(zfill_vec1):
	bsfq	%rcx, %rcx
	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
	 */
	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
	subq	%rcx, %rdx
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif


	VMOVU	%VZERO, (%rdi)
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jb	L(zfill_done0)
L(zfill_more_2x_vec):
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
	jbe	L(zfill_done)

# ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
# else
	addq	%rdi, %rdx
# endif

	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)


	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)

	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	jbe	L(zfill_done)

	/* Align rdi and zfill loop.  */
	andq	$-(VEC_SIZE), %rdi
	.p2align 4,, 12
L(zfill_loop_4x_vec):
	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(zfill_loop_4x_vec)
L(zfill_done):
	ret


	/* Less than 1x VEC case if we are not using evex masked
	   stores.  */
# if !USE_EVEX_MASKED_STORE
	.p2align 4,, 8
L(copy_1x):
	/* Special case for copying 1x VEC.  It can be handled quickly
	   and many buffer sizes have convenient alignment.  */
	VMOVU	%VMM(0), (%rdi)
	/* If no zeros then we are done.  */
	testl	%ecx, %ecx
	jz	L(ret_1x_1x)

	/* Need to zfill.  Note that we know length <= CHAR_PER_VEC,
	   so we only handle the small case here.  */
	bsf	%VRCX, %VRCX
L(zfill_less_vec_no_bsf):
	/* Adjust length / dst then just zfill less_vec.  */
	subq	%rcx, %rdx
#  ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
#  else
	addq	%rcx, %rdi
#  endif
#  ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#  endif

L(zfill_less_vec):
	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
	jb	L(zfill_less_half)

	VMOVU	%VZERO_HALF, (%rdi)
	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2) - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	ret
#  ifdef USE_AS_STPCPY
L(ret_1x_1x):
	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
	ret
#  endif


#  if VEC_SIZE == 64
	.p2align 4,, 4
L(copy_32_63):
	/* Overfill to avoid branches.  */
	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU	%VMM_256(0), (%rdi)
	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)

	/* We are taking advantage of the fact that, to be here, we
	   must be writing the null-term at (%rdi, %rcx), so we have a
	   byte of leeway for overwriting.  */
	cmpl	%ecx, %edx
	ja	L(zfill_less_vec_no_bsf)
#   ifndef USE_AS_STPCPY
L(ret_1x_1x):
#   else
#    ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
#    else
	movl	%edx, %eax
	adcq	%rdi, %rax
#    endif
#   endif
	ret
#  endif

	.p2align 4,, 4
L(copy_16_31):
	/* Overfill to avoid branches.  */
	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU	%VMM_128(0), (%rdi)
	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpl	%ecx, %edx

	/* Separate logic depending on VEC_SIZE.  If VEC_SIZE == 64
	   then we have a larger copy block for 32-63, so this just
	   falls through to the zfill for 16-31.  If VEC_SIZE == 32
	   then we check for a full zfill of less than 1x VEC.  */
#  if VEC_SIZE == 64
	jbe	L(ret_16_31)
	subl	%ecx, %edx
#   ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
#   else
	addq	%rcx, %rdi
#   endif
#   ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#   endif
L(zfill_less_half):
L(zfill_less_32):
	cmpl	$(16 / CHAR_SIZE), %edx
	jb	L(zfill_less_16)
	VMOVU	%VZERO_128, (%rdi)
	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
#   ifdef USE_AS_STPCPY
	ret
#   endif
L(ret_16_31):
#   ifdef USE_AS_STPCPY
#    ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
#    else
	movl	%edx, %eax
	adcq	%rdi, %rax
#    endif
#   endif
	ret
#  else
	/* VEC_SIZE == 32 begins.  */
	ja	L(zfill_less_vec_no_bsf)
#   ifndef USE_AS_STPCPY
L(ret_1x_1x):
#   else
#    ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
#    else
	movl	%edx, %eax
	adcq	%rdi, %rax
#    endif
#   endif
	ret
#  endif


	.p2align 4,, 4
L(copy_8_15):
	/* Overfill to avoid branches.  */
	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
	vmovq	%VMM_128(0), (%rdi)
	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpl	%ecx, %edx
	jbe	L(ret_8_15)
	subl	%ecx, %edx
#  ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
#  else
	addq	%rcx, %rdi
#  endif
#  ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#  endif
	.p2align 4,, 8
#  if VEC_SIZE == 32
L(zfill_less_half):
#  endif
L(zfill_less_16):
	xorl	%ecx, %ecx
	cmpl	$(8 / CHAR_SIZE), %edx
	jb	L(zfill_less_8)
	movq	%rcx, (%rdi)
	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
#  ifndef USE_AS_STPCPY
L(ret_8_15):
#  endif
	ret

	.p2align 4,, 8
L(less_1x_vec):
	je	L(copy_1x)

	/* We will need `tzcnt` result for all other copy sizes.  */
	tzcnt	%VRCX, %VRCX
#  if VEC_SIZE == 64
	cmpl	$(32 / CHAR_SIZE), %edx
	jae	L(copy_32_63)
#  endif

	cmpl	$(16 / CHAR_SIZE), %edx
	jae	L(copy_16_31)

	cmpl	$(8 / CHAR_SIZE), %edx
	jae	L(copy_8_15)
#  ifdef USE_AS_WCSCPY
	testl	%ecx, %ecx
	jz	L(zfill_less_8_set_ret)

	movl	(%rsi, %rdx, CHAR_SIZE), %esi
	vmovd	%VMM_128(0), (%rdi)
	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
#   ifdef USE_AS_STPCPY
	cmpl	%ecx, %edx
L(ret_8_15):
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
#   endif
	ret
L(zfill_less_8_set_ret):
	xorl	%ecx, %ecx
#   ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#   endif
L(zfill_less_8):
	movl	%ecx, (%rdi)
	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
	ret
#  else
	cmpl	$3, %edx
	jb	L(copy_0_3)
	/* Overfill to avoid branches.  */
	movl	-3(%rsi, %rdx), %esi
	vmovd	%VMM_128(0), (%rdi)
	movl	%esi, -3(%rdi, %rdx)
	cmpl	%ecx, %edx
	jbe	L(ret_4_7)
	subq	%rcx, %rdx
	addq	%rcx, %rdi
#   ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#   endif
	xorl	%ecx, %ecx
	.p2align 4,, 8
L(zfill_less_8):
	cmpl	$3, %edx
	jb	L(zfill_less_3)
	movl	%ecx, (%rdi)
	movl	%ecx, -3(%rdi, %rdx)
#   ifdef USE_AS_STPCPY
	ret
#   endif

L(ret_4_7):
#   ifdef USE_AS_STPCPY
L(ret_8_15):
	movl	%edx, %eax
	adcq	%rdi, %rax
#   endif
	ret

	.p2align 4,, 4
L(zfill_less_3):
	testl	%edx, %edx
	jz	L(zfill_1)
	movw	%cx, (%rdi)
L(zfill_1):
	movb	%cl, (%rdi, %rdx)
	ret

	.p2align 4,, 8
L(copy_0_3):
	vmovd	%VMM_128(0), %r8d
	testl	%edx, %edx
	jz	L(copy_1)
	movw	%r8w, (%rdi)
	cmpl	%ecx, %edx
	ja	L(zfill_from_1)
	movzbl	(%rsi, %rdx), %r8d
#   ifdef USE_AS_STPCPY
	movl	%edx, %eax
	adcq	%rdi, %rax
	movb	%r8b, (%rdi, %rdx)
	ret
#   endif

L(copy_1):
#   ifdef USE_AS_STPCPY
	movl	%edx, %eax
	cmpl	%ecx, %edx
	adcq	%rdi, %rax
#   endif
#   ifdef USE_AS_WCSCPY
	vmovd	%VMM_128(0), (%rdi)
#   else
	movb	%r8b, (%rdi, %rdx)
#   endif
	ret
#  endif


#  ifndef USE_AS_WCSCPY
	.p2align 4,, 8
L(zfill_from_1):
#   ifdef USE_AS_STPCPY
	leaq	(%rdi, %rcx), %rax
#   endif
	movw	$0, -1(%rdi, %rdx)
	ret
#  endif

	.p2align 4,, 4
L(zero_len):
	incq	%rdx
	jne	L(best_effort_strncpy)
	movq	%rdi, %rax
	ret
# endif


	.p2align 4,, 4
	.p2align 6,, 8
L(page_cross):
	movq	%rsi, %rax
	andq	$(VEC_SIZE * -1), %rax
	VPCMPEQ	(%rax), %VZERO, %k0
	KMOV	%k0, %VRCX
# ifdef USE_AS_WCSCPY
	movl	%esi, %r8d
	shrl	$2, %r8d
	andl	$(CHAR_PER_VEC - 1), %r8d
	shrx	%VR8, %VRCX, %VRCX
# else
	shrx	%VRSI, %VRCX, %VRCX
# endif
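	/* VRCX now has one bit per CHAR of the aligned VEC containing
	   rsi, shifted so that bit 0 corresponds to the CHAR at rsi
	   itself; set bits mark zero CHARs at or after rsi (up to the
	   end of the page).  */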

	/* Compute the number of bytes we checked.  */
	subl	%esi, %eax
	andl	$(VEC_SIZE - 1), %eax
# ifdef USE_AS_WCSCPY
	shrl	$2, %eax
# endif

	/* If rdx is below rax then the entire copy fits within the
	   CHARs we just checked, i.e. it finishes before the end of
	   the page.  */
	cmpq	%rax, %rdx
	jb	L(page_cross_small)


	/* If rcx is zero then no zero CHAR was found up to the end of
	   the page, so the string continues onto the next page and it
	   is safe to resume the normal path.  Otherwise handle the
	   copy here.  */
	test	%VRCX, %VRCX
	jz	L(page_cross_continue)

	/* We found a zero CHAR, so copy up to it and then zfill (we
	   know we didn't cover the whole length here).  */
	bsf	%VRCX, %VRCX
L(movsb_and_zfill):
	incl	%ecx
	subq	%rcx, %rdx
# ifdef USE_AS_STPCPY
	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
# else
	movq	%rdi, %rax
# endif

	REP_MOVS
# ifdef USE_AS_WCSCPY
	movl	$0, (%rdi)
# else
	movb	$0, (%rdi)
# endif
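	/* rcx CHARs (including the null terminator) have just been
	   copied by REP_MOVS and rdx was reduced accordingly, keeping
	   the usual -1 bias relative to the CHARs still to be zeroed,
	   so the shared zfill code below clears the rest of the
	   buffer.  */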
	jmp	L(zfill_from_page_cross)

L(page_cross_small):
	tzcnt	%VRCX, %VRCX
	cmpl	%ecx, %edx
	jbe	L(page_cross_copy_only)

	/* Do a zfill of the tail before copying.  */
	movq	%rdi, %r9
	xorl	%eax, %eax

	movl	%ecx, %r8d

	subl	%ecx, %edx
	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
	movl	%edx, %ecx
	REP_STOS
	movq	%r9, %rdi
	movl	%r8d, %edx
L(page_cross_copy_only):
	leal	1(%rdx), %ecx
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	adcl	$0, %edx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
#  else
	movl	%edx, %eax
	adcq	%rdi, %rax
#  endif
# else
	movq	%rdi, %rax
# endif
	REP_MOVS
	ret


L(best_effort_strncpy):
	movq	%rdx, %rcx
	xorl	%eax, %eax
	movq	%rdi, %r8
	/* The length is >= 2^63.  We fully expect the rep stos to
	   segfault.  If it somehow does not, just strcpy to finish.  */
	REP_STOS
	movq	%r8, %rdi
	jmp	OVERFLOW_STRCPY
END(STRNCPY)
#endif
