/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#if ISA_SHOULD_BUILD (4)


	/* Use evex-masked stores for small sizes. Turned off at the
	   moment. */
# define USE_EVEX_MASKED_STORE 0
	/* Use movsb in page cross case to save code size. */
# define USE_MOVSB_IN_PAGE_CROSS 1

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifndef STRCPY
#  define STRCPY __strcpy_evex
# endif


# ifdef USE_AS_WCSCPY
#  define VMOVU_MASK vmovdqu32
#  define VPMIN vpminud
#  define VPTESTN vptestnmd
#  define VPTEST vptestmd
#  define VPCMPEQ vpcmpeqd
#  define CHAR_SIZE 4

#  define REP_MOVS rep movsd

#  define USE_WIDE_CHAR
# else
#  define VMOVU_MASK vmovdqu8
#  define VPMIN vpminub
#  define VPTESTN vptestnmb
#  define VPTEST vptestmb
#  define VPCMPEQ vpcmpeqb
#  define CHAR_SIZE 1

#  define REP_MOVS rep movsb
# endif

# include "reg-macros.h"


# ifdef USE_AS_STPCPY
#  define END_REG rax
# else
#  define END_REG rdi, %rdx, CHAR_SIZE
# endif

# ifdef USE_AS_STRCAT
#  define PAGE_ALIGN_REG edx
#  define PAGE_ALIGN_REG_64 rdx
# else
#  define PAGE_ALIGN_REG eax
#  define PAGE_ALIGN_REG_64 rax
# endif

# define VZERO VMM(7)
# define VZERO_128 VMM_128(7)


# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

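	/* Illustrative values (an assumption based on the stock
	   x86-evex256-vecs.h header and its 512-bit counterpart):
	   VEC_SIZE is 32 for the 256-bit build and 64 for the 512-bit
	   build, so CHAR_PER_VEC is 32 or 64 for the byte variants
	   (CHAR_SIZE == 1) and 8 or 16 for the wide-char variants
	   (CHAR_SIZE == 4).  All mask tests below operate on
	   CHAR_PER_VEC bits. */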

	.section SECTION(.text), "ax", @progbits
ENTRY(STRCPY)
# ifdef USE_AS_STRCAT
	movq %rdi, %rax
#  include "strcat-strlen-evex.h.S"
# endif

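	/* If src is within VEC_SIZE bytes of the end of its page, an
	   unaligned VEC_SIZE load below could touch the following
	   (possibly unmapped) page, so take the cold page-cross path.
	   E.g., assuming PAGE_SIZE == 4096 and VEC_SIZE == 32, a src
	   page offset of 4072 is above 4064 and branches to
	   L(page_cross). */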
	movl %esi, %PAGE_ALIGN_REG
	andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
	cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
	ja L(page_cross)
L(page_cross_continue):
	VMOVU (%rsi), %VMM(0)
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq %rdi, %rax
# endif


	/* Two short-string implementations: one with a traditional
	   branching approach and one with masked instructions (which
	   can perform dramatically badly if dst splits a page and is
	   not in the TLB). */
# if USE_EVEX_MASKED_STORE
	VPTEST %VMM(0), %VMM(0), %k0
	KMOV %k0, %VRCX
#  ifdef USE_AS_WCSCPY
	subl $((1 << CHAR_PER_VEC) - 1), %VRCX
#  else
	inc %VRCX
#  endif
	jz L(more_1x_vec)
	KMOV %VRCX, %k1
	KXOR %k0, %k1, %k1

	VMOVU_MASK %VMM(0), (%rdi){%k1}

#  ifdef USE_AS_STPCPY
	bsf %VRCX, %VRCX
	leaq (%rdi, %rcx, CHAR_SIZE), %rax
#  endif
	ret
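	/* How the masked-store path above builds its mask (byte case):
	   k0 has a bit set for every nonzero char, so VRCX + 1 clears
	   exactly the bits below the first zero char and sets that
	   char's bit; XOR-ing the result with k0 leaves a mask covering
	   the string and its null terminator, which the single masked
	   store writes.  E.g. for "ab\0...": bits 0-1 of k0 are set and
	   bit 2 is clear, so the XOR gives 0b111 and 3 bytes are
	   stored.  The wcscpy variant reaches the same state via the
	   subl above. */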

# else
	VPTESTN %VMM(0), %VMM(0), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jz L(more_1x_vec)

	xorl %edx, %edx
	bsf %VRCX, %VRDX
#  ifdef USE_AS_STPCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
#  endif

	/* Use the mask bits in rcx to detect which copy we need.  If
	   the low mask is zero then there must be a bit set in the
	   upper half.  I.e. if rcx != 0 and ecx == 0, then the match
	   must be in the upper 32 bits, so we use L(copy_32_63). */
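	/* Worked example (strcpy, VEC_SIZE == 64): if the first null is
	   at byte 40, bits 0-39 of rcx are clear, so ecx == 0 and we
	   take L(copy_32_63); if it is at byte 10, cl == 0 and we land
	   in L(copy_8_15).  For wcscpy each mask bit covers a 4-byte
	   char, hence the smaller immediates ($0xf, $0x3) below. */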
#  if VEC_SIZE == 64
#   ifdef USE_AS_WCSCPY
	testb %cl, %cl
#   else
	testl %ecx, %ecx
#   endif
	jz L(copy_32_63)
#  endif

#  ifdef USE_AS_WCSCPY
	testb $0xf, %cl
#  else
	testw %cx, %cx
#  endif
	jz L(copy_16_31)


#  ifdef USE_AS_WCSCPY
	testb $0x3, %cl
#  else
	testb %cl, %cl
#  endif
	jz L(copy_8_15)


#  ifdef USE_AS_WCSCPY
	vmovd %VMM_128(0), (%rdi)
	/* No need to copy, we know it's zero. */
	movl $0, (%END_REG)

	ret
#  else

	testb $0x7, %cl
	jz L(copy_4_7)


	test %edx, %edx
	jz L(set_null_term)

	/* NB: make this `vmovw` if support for AVX512-FP16 is
	   added. */
	vmovd %VMM_128(0), %esi
	movw %si, (%rdi)

	.p2align 4,, 1
L(set_null_term):
	/* No need to copy, we know it's zero. */
	movb $0, (%END_REG)
	ret
#  endif

#  if VEC_SIZE == 64
	.p2align 4,, 6
L(copy_32_63):
	VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU %VMM_256(0), (%rdi)
	VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
	ret
#  endif


	.p2align 4,, 6
L(copy_16_31):
	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
	   and will save code size. */
	vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU %VMM_128(0), (%rdi)
	vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
	ret

	.p2align 4,, 8
L(copy_8_15):
#  ifdef USE_AS_WCSCPY
	movl -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
#  else
	movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
#  endif
	vmovq %VMM_128(0), (%rdi)
	movq %rcx, -(8 - CHAR_SIZE)(%END_REG)
	ret
# endif


# ifndef USE_AS_WCSCPY
	.p2align 4,, 12
L(copy_4_7):
	movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
	vmovd %VMM_128(0), (%rdi)
	movl %ecx, -(4 - CHAR_SIZE)(%END_REG)
	ret
# endif
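	/* All of the L(copy_*) blocks above use the same overlapping
	   trick: store one fixed-size chunk from the start of the
	   string and a second chunk of the same size whose last char is
	   the null terminator (END_REG points at the terminator in
	   dst).  Two possibly-overlapping stores thus cover every
	   length in the range without a copy loop. */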


	.p2align 4,, 8
L(more_1x_vec):
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	VMOVU %VMM(0), (%rdi)
# endif
	subq %rsi, %rdi
	andq $-(VEC_SIZE), %rsi
	addq %rsi, %rdi
	VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)

	/* Ideally we store after moves to minimize the impact of
	   potential false dependencies. */
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	VMOVU %VMM(0), (%rax)
# endif

	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x1)

	VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU %VMM(1), VEC_SIZE(%rdi)

	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x2)

	VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
	VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)

	VPTESTN %VMM(3), %VMM(3), %k0
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(ret_vec_x3)

	VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	VPTESTN %VMM(4), %VMM(4), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x4)

	VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)


	/* Align for 4x loop. */
	subq %rsi, %rdi

	/* + VEC_SIZE * 5: skip the four aligned vectors checked above
	   plus the original VEC_SIZE covered before aligning, which was
	   never added to rsi. */
	subq $-(VEC_SIZE * 5), %rsi
	andq $-(VEC_SIZE * 4), %rsi
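	/* After the subtraction above rdi holds (dst - src), so inside
	   the loop the destination is re-formed as (%rdi, %rsi).  rsi
	   is advanced past the five vectors already handled and rounded
	   down to a 4 * VEC_SIZE boundary; this may step back over up
	   to three vectors that were already copied, which is harmless,
	   skips no unchecked data, and keeps the loop working on
	   4 * VEC_SIZE-aligned blocks. */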


	/* Load first half of the loop before entry. */
	VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4
	KORTEST %k2, %k4
	jnz L(loop_4x_done)
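	/* The VPMIN/VPTESTN/KORTEST sequence above (and in the loop
	   below) checks four vectors at once: the unsigned minimum of
	   two vectors has a zero char exactly where either input does,
	   so k2 covers VMM(0)/VMM(1), k4 covers VMM(2)/VMM(3), and
	   KORTEST sets ZF only when all four are free of the null
	   terminator. */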

	.p2align 4,, 11
L(loop_4x_vec):

	VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
	VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)

	subq $(VEC_SIZE * -4), %rsi

	VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)


	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4
	KORTEST %k2, %k4
	jz L(loop_4x_vec)

L(loop_4x_done):
	VPTESTN %VMM(0), %VMM(0), %k0
	KMOV %k0, %VRCX
	/* Restore the dst pointer (%rdi). */
	addq %rsi, %rdi
	test %VRCX, %VRCX
	jnz L(ret_vec_x0_end)
	VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)

	KMOV %k2, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x1)
	VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)

	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x2)
	VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
	/* Place L(ret_vec_x3) here to save code size.  We get a
	   meaningful benefit doing this for stpcpy. */
	KMOV %k4, %VRDX
L(ret_vec_x3):
	bsf %VRDX, %VRDX
	VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
# endif
L(return_end):
	ret

	.p2align 4,, 6
L(ret_vec_x0_end):
	bsf %VRCX, %VRCX
# ifdef USE_AS_STPCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rax
# endif
	inc %VRCX
	VMOVU (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
	ret

	.p2align 4,, 8
L(ret_vec_x1):
	bsf %VRCX, %VRCX
	VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

	.p2align 4,, 4
L(ret_vec_x2):
	bsf %VRCX, %VRCX
	VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

	/* L(ret_vec_x3) reuses the return code placed after the loop. */
	.p2align 4,, 6
L(ret_vec_x4):
	bsf %VRCX, %VRCX
	VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret


	.p2align 4,, 4
L(page_cross):
# ifndef USE_AS_STRCAT
	vpxorq %VZERO_128, %VZERO_128, %VZERO_128
# endif
	movq %rsi, %rcx
	andq $(VEC_SIZE * -1), %rcx

	VPCMPEQ (%rcx), %VZERO, %k0
	KMOV %k0, %VRCX
# ifdef USE_AS_WCSCPY
	andl $(VEC_SIZE - 1), %PAGE_ALIGN_REG
	shrl $2, %PAGE_ALIGN_REG
# endif
	shrx %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
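	/* The compare above was done on the VEC_SIZE-aligned address,
	   so shift the mask right by rsi's misalignment (converted to
	   chars for wcscpy by the andl/shrl above; in the byte case the
	   shift count is implicitly masked to the mask width) so that
	   bit 0 of VRCX corresponds to the first char at rsi. */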

# if USE_MOVSB_IN_PAGE_CROSS
	/* Optimize more aggressively for space as this is very cold
	   code.  This saves two cache lines. */

	/* The shift left adds one to the index the later bsf returns,
	   which gives the correct copy bound (the terminator is
	   included).  NB: this can never zero out a non-zero RCX: in
	   the page-cross case rsi cannot be VEC_SIZE-aligned, and rcx
	   has already been right-shifted by that misalignment, so its
	   top bit is clear. */
	shl %VRCX
	jz L(page_cross_continue)
#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq %rdi, %rax
#  endif
	bsf %VRCX, %VRCX
	REP_MOVS

#  ifdef USE_AS_STPCPY
	leaq -CHAR_SIZE(%rdi), %rax
#  endif
	ret
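	/* In the rep-movs path above, the left shift makes bsf return
	   the index of the null plus one, i.e. the count in chars
	   including the terminator, which is exactly what REP_MOVS
	   needs in rcx.  Afterwards rdi points one char past the
	   terminator, so stpcpy backs it up by CHAR_SIZE for the
	   return value. */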


# else
	/* Check if we found a zero char before the end of the page. */
	test %VRCX, %VRCX
	jz L(page_cross_continue)

	/* Traditional copy case, essentially the same as in the
	   non-page-cross case, but since we can't reuse VMM(0) we need
	   twice as many loads from rsi. */

#  ifndef USE_AS_STRCAT
	xorl %edx, %edx
#  endif
	/* Dependency on rdi must already have been satisfied. */
	bsf %VRCX, %VRDX
#  ifdef USE_AS_STPCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
#  elif !defined USE_AS_STRCAT
	movq %rdi, %rax
#  endif

#  if VEC_SIZE == 64
#   ifdef USE_AS_WCSCPY
	testb %cl, %cl
#   else
	test %ecx, %ecx
#   endif
	jz L(page_cross_copy_32_63)
#  endif

#  ifdef USE_AS_WCSCPY
	testb $0xf, %cl
#  else
	testw %cx, %cx
#  endif
	jz L(page_cross_copy_16_31)

#  ifdef USE_AS_WCSCPY
	testb $0x3, %cl
#  else
	testb %cl, %cl
#  endif
	jz L(page_cross_copy_8_15)

#  ifdef USE_AS_WCSCPY
	movl (%rsi), %esi
	movl %esi, (%rdi)
	movl $0, (%END_REG)
	ret
#  else

	testb $0x7, %cl
	jz L(page_cross_copy_4_7)

	test %edx, %edx
	jz L(page_cross_set_null_term)
	movzwl (%rsi), %ecx
	movw %cx, (%rdi)
L(page_cross_set_null_term):
	movb $0, (%END_REG)
	ret


	.p2align 4,, 4
L(page_cross_copy_4_7):
	movl (%rsi), %ecx
	movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
	movl %ecx, (%rdi)
	movl %esi, -(4 - CHAR_SIZE)(%END_REG)
	ret
#  endif

#  if VEC_SIZE == 64
	.p2align 4,, 4
L(page_cross_copy_32_63):
	VMOVU (%rsi), %VMM_256(0)
	VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU %VMM_256(0), (%rdi)
	VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
	ret
#  endif

	.p2align 4,, 4
L(page_cross_copy_16_31):
	vmovdqu (%rsi), %xmm0
	vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
	vmovdqu %xmm0, (%rdi)
	vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
	ret

	.p2align 4,, 4
L(page_cross_copy_8_15):
	movq (%rsi), %rcx
	movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
	movq %rcx, (%rdi)
	movq %rsi, -(8 - CHAR_SIZE)(%END_REG)
	ret
# endif
END(STRCPY)
#endif