/* strcpy with AVX2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY __strcpy_avx2
#  endif

# endif

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
#  define VEC_SIZE 32
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

/* zero register */
#define xmmZ xmm0
#define ymmZ ymm0

/* mask register */
#define ymmM ymm1

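/* Register usage: %rdi is the destination, %rsi the source; for the
   strncpy/stpncpy variants the length bound arrives in %rdx and is
   copied to %r8 at entry.  ymmZ (ymm0) is kept at zero throughout, and
   a terminating null byte is located by comparing source bytes against
   it with vpcmpeqb, extracting the byte mask with vpmovmskb and
   scanning the mask with test/bsf.  The USE_AS_STPCPY, USE_AS_STRNCPY
   and USE_AS_STRCAT macros adapt this body for the other members of
   the strcpy family built from this file; with USE_AS_STRCAT the
   ENTRY/prologue is supplied by the including file.

   For orientation only, a minimal C sketch of the strncpy semantics
   the USE_AS_STRNCPY paths implement (copy at most n bytes, zero-pad
   the tail when the source is shorter); this is an illustration, not
   the vectorized algorithm used below:

       char *strncpy_sketch (char *dst, const char *src, size_t n)
       {
         size_t i = 0;
         for (; i < n && src[i] != '\0'; i++)
           dst[i] = src[i];
         for (; i < n; i++)
           dst[i] = '\0';
         return dst;
       }
 */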
# ifndef USE_AS_STRCAT

        .section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
        mov %RDX_LP, %R8_LP
        test %R8_LP, %R8_LP
        jz L(ExitZero)
# endif
        mov %rsi, %rcx
# ifndef USE_AS_STPCPY
        mov %rdi, %rax /* save result */
# endif

# endif

        vpxor %xmmZ, %xmmZ, %xmmZ

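/* %rcx holds the source offset within a (VEC_SIZE * 4) block.  If that
   offset is at most VEC_SIZE * 2, the first two vectors can be loaded
   unaligned without crossing the block (and therefore without crossing
   a page), so take the short path at
   L(SourceStringAlignmentLessTwoVecSize).  Otherwise round %rsi down
   to a VEC_SIZE boundary, compare the aligned vector against zero and
   shift the mask right by the misalignment so that bits belonging to
   bytes before the start of the string are discarded.  */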
        and $((VEC_SIZE * 4) - 1), %ecx
        cmp $(VEC_SIZE * 2), %ecx
        jbe L(SourceStringAlignmentLessTwoVecSize)

        and $-VEC_SIZE, %rsi
        and $(VEC_SIZE - 1), %ecx

        vpcmpeqb (%rsi), %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        shr %cl, %rdx

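/* For strncpy: %r10 = number of bytes from the start of the string to
   the end of this first aligned vector (VEC_SIZE minus the
   misalignment; the plain strncpy case uses one byte more).  If the
   length bound %r8 does not reach past that, branch out to the code
   that decides whether the terminator or the bound comes first.  */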
# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
        mov $VEC_SIZE, %r10
        sub %rcx, %r10
        cmp %r10, %r8
#  else
        mov $(VEC_SIZE + 1), %r10
        sub %rcx, %r10
        cmp %r10, %r8
#  endif
        jbe L(CopyVecSizeTailCase2OrCase3)
# endif
        test %edx, %edx
        jnz L(CopyVecSizeTail)

        vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
        vpmovmskb %ymm2, %edx

# ifdef USE_AS_STRNCPY
        add $VEC_SIZE, %r10
        cmp %r10, %r8
        jbe L(CopyTwoVecSizeCase2OrCase3)
# endif
        test %edx, %edx
        jnz L(CopyTwoVecSize)

        vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */
        vmovdqu %ymm2, (%rdi)

/* Source and destination are in general mutually misaligned from here
   on: %rsi has been rounded down to a VEC_SIZE boundary, so the code
   below uses aligned loads and unaligned stores.  */
        .p2align 4
L(UnalignVecSizeBoth):
        sub %rcx, %rdi
# ifdef USE_AS_STRNCPY
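        /* Saturating add: %r8 += %rcx, and if the addition carries,
           force %r8 to SIZE_MAX so the remaining bound cannot wrap.  */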
        add %rcx, %r8
        sbb %rcx, %rcx
        or %rcx, %r8
# endif
        mov $VEC_SIZE, %rcx
        vmovdqa (%rsi, %rcx), %ymm2
        vmovdqu %ymm2, (%rdi, %rcx)
        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
        vpcmpeqb %ymm2, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $(VEC_SIZE * 3), %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec2)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqu %ymm2, (%rdi, %rcx)
        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
        vpcmpeqb %ymm3, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec3)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqu %ymm3, (%rdi, %rcx)
        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
        vpcmpeqb %ymm4, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec4)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqu %ymm4, (%rdi, %rcx)
        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
        vpcmpeqb %ymm2, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec2)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqu %ymm2, (%rdi, %rcx)
        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
        vpcmpeqb %ymm2, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec2)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
        vmovdqu %ymm2, (%rdi, %rcx)
        vpcmpeqb %ymm3, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec3)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqu %ymm3, (%rdi, %rcx)
        mov %rsi, %rdx
        lea VEC_SIZE(%rsi, %rcx), %rsi
        and $-(VEC_SIZE * 4), %rsi
        sub %rsi, %rdx
        sub %rdx, %rdi
# ifdef USE_AS_STRNCPY
        lea (VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
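/* Main loop: process VEC_SIZE * 4 bytes per iteration from the now
   (VEC_SIZE * 4)-aligned source.  vpminub folds the four vectors into
   one whose byte is zero exactly where at least one of the four has a
   zero byte, so a single compare plus vpmovmskb detects a terminator
   anywhere in the block.  The compare uses ymmM rather than ymmZ: the
   last mask written to ymmM before the loop was all zero (no
   terminator was found), and the loop never writes ymmM, so it serves
   as the zero vector here.  */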
L(UnalignedFourVecSizeLoop):
        vmovdqa (%rsi), %ymm4
        vmovdqa VEC_SIZE(%rsi), %ymm5
        vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
        vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
        vpminub %ymm5, %ymm4, %ymm2
        vpminub %ymm7, %ymm6, %ymm3
        vpminub %ymm2, %ymm3, %ymm3
        vpcmpeqb %ymmM, %ymm3, %ymm3
        vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
        sub $(VEC_SIZE * 4), %r8
        jbe L(UnalignedLeaveCase2OrCase3)
# endif
        test %edx, %edx
        jnz L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
        add $(VEC_SIZE * 4), %rdi
        add $(VEC_SIZE * 4), %rsi
        vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
        vmovdqa (%rsi), %ymm4
        vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
        vmovdqa VEC_SIZE(%rsi), %ymm5
        vpminub %ymm5, %ymm4, %ymm2
        vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
        vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
        vmovdqu %ymm7, -VEC_SIZE(%rdi)
        vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
        vpminub %ymm7, %ymm6, %ymm3
        vpminub %ymm2, %ymm3, %ymm3
        vpcmpeqb %ymmM, %ymm3, %ymm3
        vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
        sub $(VEC_SIZE * 4), %r8
        jbe L(UnalignedLeaveCase2OrCase3)
# endif
        test %edx, %edx
        jz L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
        vpcmpeqb %ymm4, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        test %edx, %edx
        jnz L(CopyVecSizeUnaligned_0)

        vpcmpeqb %ymm5, %ymmZ, %ymmM
        vpmovmskb %ymmM, %ecx
        test %ecx, %ecx
        jnz L(CopyVecSizeUnaligned_16)

        vpcmpeqb %ymm6, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        test %edx, %edx
        jnz L(CopyVecSizeUnaligned_32)

        vpcmpeqb %ymm7, %ymmZ, %ymmM
        vpmovmskb %ymmM, %ecx
        bsf %ecx, %edx
        vmovdqu %ymm4, (%rdi)
        vmovdqu %ymm5, VEC_SIZE(%rdi)
        vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
        lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
#  endif
        vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
        add $(VEC_SIZE - 1), %r8
        sub %rdx, %r8
        lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
        jmp L(StrncpyFillTailWithZero)
# else
        add $(VEC_SIZE * 3), %rsi
        add $(VEC_SIZE * 3), %rdi
        jmp L(CopyVecSizeExit)
# endif

/* Short path: the source offset within a (VEC_SIZE * 4) block is at
   most VEC_SIZE * 2, so the first two vectors can be loaded unaligned
   without crossing that block (and hence without crossing a page).  */

L(SourceStringAlignmentLessTwoVecSize):
        vmovdqu (%rsi), %ymm3
        vmovdqu VEC_SIZE(%rsi), %ymm2
        vpcmpeqb %ymm3, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
        cmp $VEC_SIZE, %r8
#  else
        cmp $(VEC_SIZE + 1), %r8
#  endif
        jbe L(CopyVecSizeTail1Case2OrCase3)
# endif
        test %edx, %edx
        jnz L(CopyVecSizeTail1)

        vmovdqu %ymm3, (%rdi)
        vpcmpeqb %ymm2, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
        cmp $(VEC_SIZE * 2), %r8
#  else
        cmp $((VEC_SIZE * 2) + 1), %r8
#  endif
        jbe L(CopyTwoVecSize1Case2OrCase3)
# endif
        test %edx, %edx
        jnz L(CopyTwoVecSize1)

        and $-VEC_SIZE, %rsi
        and $(VEC_SIZE - 1), %ecx
        jmp L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

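/* The strncpy paths distinguish three situations after examining a
   chunk of the source:
   Case1 - a terminator was found and the length bound is not yet an
           issue: copy up to and including the null byte (and, for
           strncpy, zero-fill the rest of the bound).
   Case2 - both a terminator and the end of the bound fall inside the
           chunk: compare their positions and honour whichever comes
           first.
   Case3 - the bound is exhausted with no terminator seen: copy exactly
           the remaining bytes and stop.  */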
/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
        .p2align 4
L(CopyVecSize):
        add %rcx, %rdi
# endif
L(CopyVecSizeTail):
        add %rcx, %rsi
L(CopyVecSizeTail1):
        bsf %edx, %edx
L(CopyVecSizeExit):
        cmp $32, %edx
        jae L(Exit32_63)
        cmp $16, %edx
        jae L(Exit16_31)
        cmp $8, %edx
        jae L(Exit8_15)
        cmp $4, %edx
        jae L(Exit4_7)
        cmp $3, %edx
        je L(Exit3)
        cmp $1, %edx
        ja L(Exit2)
        je L(Exit1)
        movb $0, (%rdi)
# ifdef USE_AS_STPCPY
        lea (%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub $1, %r8
        lea 1(%rdi), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
L(return_vzeroupper):
        ZERO_UPPER_VEC_REGISTERS_RETURN

        .p2align 4
L(CopyTwoVecSize1):
        add $VEC_SIZE, %rsi
        add $VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub $VEC_SIZE, %r8
# endif
        jmp L(CopyVecSizeTail1)

        .p2align 4
L(CopyTwoVecSize):
        bsf %edx, %edx
        add %rcx, %rsi
        add $VEC_SIZE, %edx
        sub %ecx, %edx
        jmp L(CopyVecSizeExit)

        .p2align 4
L(CopyVecSizeUnaligned_0):
        bsf %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
#  endif
        vmovdqu %ymm4, (%rdi)
        add $((VEC_SIZE * 4) - 1), %r8
        sub %rdx, %r8
        lea 1(%rdi, %rdx), %rdi
        jmp L(StrncpyFillTailWithZero)
# else
        jmp L(CopyVecSizeExit)
# endif

        .p2align 4
L(CopyVecSizeUnaligned_16):
        bsf %ecx, %edx
        vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
        lea VEC_SIZE(%rdi, %rdx), %rax
#  endif
        vmovdqu %ymm5, VEC_SIZE(%rdi)
        add $((VEC_SIZE * 3) - 1), %r8
        sub %rdx, %r8
        lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
        jmp L(StrncpyFillTailWithZero)
# else
        add $VEC_SIZE, %rsi
        add $VEC_SIZE, %rdi
        jmp L(CopyVecSizeExit)
# endif

        .p2align 4
L(CopyVecSizeUnaligned_32):
        bsf %edx, %edx
        vmovdqu %ymm4, (%rdi)
        vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
        lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
#  endif
        vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
        add $((VEC_SIZE * 2) - 1), %r8
        sub %rdx, %r8
        lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
        jmp L(StrncpyFillTailWithZero)
# else
        add $(VEC_SIZE * 2), %rsi
        add $(VEC_SIZE * 2), %rdi
        jmp L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
        .p2align 4
L(CopyVecSizeUnalignedVec6):
        vmovdqu %ymm6, (%rdi, %rcx)
        jmp L(CopyVecSizeVecExit)

        .p2align 4
L(CopyVecSizeUnalignedVec5):
        vmovdqu %ymm5, (%rdi, %rcx)
        jmp L(CopyVecSizeVecExit)

        .p2align 4
L(CopyVecSizeUnalignedVec4):
        vmovdqu %ymm4, (%rdi, %rcx)
        jmp L(CopyVecSizeVecExit)

        .p2align 4
L(CopyVecSizeUnalignedVec3):
        vmovdqu %ymm3, (%rdi, %rcx)
        jmp L(CopyVecSizeVecExit)
#  endif

/* Case2 */

        .p2align 4
L(CopyVecSizeCase2):
        add $VEC_SIZE, %r8
        add %rcx, %rdi
        add %rcx, %rsi
        bsf %edx, %edx
        cmp %r8d, %edx
        jb L(CopyVecSizeExit)
        jmp L(StrncpyExit)

        .p2align 4
L(CopyTwoVecSizeCase2):
        add %rcx, %rsi
        bsf %edx, %edx
        add $VEC_SIZE, %edx
        sub %ecx, %edx
        cmp %r8d, %edx
        jb L(CopyVecSizeExit)
        jmp L(StrncpyExit)

L(CopyVecSizeTailCase2):
        add %rcx, %rsi
        bsf %edx, %edx
        cmp %r8d, %edx
        jb L(CopyVecSizeExit)
        jmp L(StrncpyExit)

L(CopyVecSizeTail1Case2):
        bsf %edx, %edx
        cmp %r8d, %edx
        jb L(CopyVecSizeExit)
        jmp L(StrncpyExit)

/* Case2 or Case3, Case3 */

        .p2align 4
L(CopyVecSizeCase2OrCase3):
        test %rdx, %rdx
        jnz L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
        add $VEC_SIZE, %r8
        add %rcx, %rdi
        add %rcx, %rsi
        jmp L(StrncpyExit)

        .p2align 4
L(CopyTwoVecSizeCase2OrCase3):
        test %rdx, %rdx
        jnz L(CopyTwoVecSizeCase2)
        add %rcx, %rsi
        jmp L(StrncpyExit)

        .p2align 4
L(CopyVecSizeTailCase2OrCase3):
        test %rdx, %rdx
        jnz L(CopyVecSizeTailCase2)
        add %rcx, %rsi
        jmp L(StrncpyExit)

        .p2align 4
L(CopyTwoVecSize1Case2OrCase3):
        add $VEC_SIZE, %rdi
        add $VEC_SIZE, %rsi
        sub $VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
        test %rdx, %rdx
        jnz L(CopyVecSizeTail1Case2)
        jmp L(StrncpyExit)
# endif

/*--------- Exit labels below copy the final 1..VEC_SIZE or 1..(VEC_SIZE * 2) bytes ---------*/

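/* L(ExitN) / L(ExitN_M): %rdx is the index of the terminating null in
   the current chunk, so rdx + 1 bytes are stored, the larger sizes
   with a pair of overlapping loads and stores.  For stpcpy the return
   value points at the stored null; for strncpy any remaining bound is
   zero-filled afterwards.  */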
        .p2align 4
L(Exit1):
        movzwl (%rsi), %edx
        mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
        lea 1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub $2, %r8
        lea 2(%rdi), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit2):
        movzwl (%rsi), %ecx
        mov %cx, (%rdi)
        movb $0, 2(%rdi)
# ifdef USE_AS_STPCPY
        lea 2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub $3, %r8
        lea 3(%rdi), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit3):
        mov (%rsi), %edx
        mov %edx, (%rdi)
# ifdef USE_AS_STPCPY
        lea 3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub $4, %r8
        lea 4(%rdi), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit4_7):
        mov (%rsi), %ecx
        mov %ecx, (%rdi)
        mov -3(%rsi, %rdx), %ecx
        mov %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub %rdx, %r8
        sub $1, %r8
        lea 1(%rdi, %rdx), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit8_15):
        mov (%rsi), %rcx
        mov -7(%rsi, %rdx), %r9
        mov %rcx, (%rdi)
        mov %r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub %rdx, %r8
        sub $1, %r8
        lea 1(%rdi, %rdx), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit16_31):
        vmovdqu (%rsi), %xmm2
        vmovdqu -15(%rsi, %rdx), %xmm3
        vmovdqu %xmm2, (%rdi)
        vmovdqu %xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub %rdx, %r8
        sub $1, %r8
        lea 1(%rdi, %rdx), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit32_63):
        vmovdqu (%rsi), %ymm2
        vmovdqu -31(%rsi, %rdx), %ymm3
        vmovdqu %ymm2, (%rdi)
        vmovdqu %ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub %rdx, %r8
        sub $1, %r8
        lea 1(%rdi, %rdx), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

# ifdef USE_AS_STRNCPY

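/* L(StrncpyExitN) / L(StrncpyExitN_M): the length bound ran out before
   a terminator was seen; %r8 holds the number of bytes still allowed.
   Copy exactly %r8 bytes (overlapping loads and stores for the wider
   sizes) and do not zero-terminate, except for the strcat variants,
   which always store a trailing null.  */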
        .p2align 4
L(StrncpyExit1):
        movzbl (%rsi), %edx
        mov %dl, (%rdi)
#  ifdef USE_AS_STPCPY
        lea 1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
        movb $0, 1(%rdi)
#  endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit2):
        movzwl (%rsi), %edx
        mov %dx, (%rdi)
#  ifdef USE_AS_STPCPY
        lea 2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
        movb $0, 2(%rdi)
#  endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit3_4):
        movzwl (%rsi), %ecx
        movzwl -2(%rsi, %r8), %edx
        mov %cx, (%rdi)
        mov %dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
        lea (%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
        movb $0, (%rdi, %r8)
#  endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit5_8):
        mov (%rsi), %ecx
        mov -4(%rsi, %r8), %edx
        mov %ecx, (%rdi)
        mov %edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
        lea (%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
        movb $0, (%rdi, %r8)
#  endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit9_16):
        mov (%rsi), %rcx
        mov -8(%rsi, %r8), %rdx
        mov %rcx, (%rdi)
        mov %rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
        lea (%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
        movb $0, (%rdi, %r8)
#  endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit17_32):
        vmovdqu (%rsi), %xmm2
        vmovdqu -16(%rsi, %r8), %xmm3
        vmovdqu %xmm2, (%rdi)
        vmovdqu %xmm3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
        lea (%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
        movb $0, (%rdi, %r8)
#  endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit33_64):
        /* 0/32, 31/16 */
        vmovdqu (%rsi), %ymm2
        vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
        vmovdqu %ymm2, (%rdi)
        vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
        lea (%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
        movb $0, (%rdi, %r8)
#  endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit65):
        /* 0/32, 32/32, 64/1 */
        vmovdqu (%rsi), %ymm2
        vmovdqu 32(%rsi), %ymm3
        mov 64(%rsi), %cl
        vmovdqu %ymm2, (%rdi)
        vmovdqu %ymm3, 32(%rdi)
        mov %cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
        lea 65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
        movb $0, 65(%rdi)
#  endif
        VZEROUPPER_RETURN

#  ifndef USE_AS_STRCAT

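/* L(FillN) / L(FillN_M): write N (respectively %r8) zero bytes at
   %rdi.  %rdx is zero whenever these labels are reached; the 17..32
   byte case uses the zero vector directly.  */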
        .p2align 4
L(Fill1):
        mov %dl, (%rdi)
        VZEROUPPER_RETURN

        .p2align 4
L(Fill2):
        mov %dx, (%rdi)
        VZEROUPPER_RETURN

        .p2align 4
L(Fill3_4):
        mov %dx, (%rdi)
        mov %dx, -2(%rdi, %r8)
        VZEROUPPER_RETURN

        .p2align 4
L(Fill5_8):
        mov %edx, (%rdi)
        mov %edx, -4(%rdi, %r8)
        VZEROUPPER_RETURN

        .p2align 4
L(Fill9_16):
        mov %rdx, (%rdi)
        mov %rdx, -8(%rdi, %r8)
        VZEROUPPER_RETURN

        .p2align 4
L(Fill17_32):
        vmovdqu %xmmZ, (%rdi)
        vmovdqu %xmmZ, -16(%rdi, %r8)
        VZEROUPPER_RETURN

        .p2align 4
L(CopyVecSizeUnalignedVec2):
        vmovdqu %ymm2, (%rdi, %rcx)

        .p2align 4
L(CopyVecSizeVecExit):
        bsf %edx, %edx
        add $(VEC_SIZE - 1), %r8
        add %rcx, %rdi
#   ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
#   endif
        sub %rdx, %r8
        lea 1(%rdi, %rdx), %rdi

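/* strncpy padding: the string ended before the length bound, so the
   remaining %r8 bytes of the destination are cleared.  If more than
   VEC_SIZE bytes remain, store one unaligned zero vector, round %rdi
   down to a VEC_SIZE boundary (adding the realignment back into the
   count) and clear four vectors per iteration with aligned stores;
   the last few bytes go through the L(Fill*) labels above.  */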
        .p2align 4
L(StrncpyFillTailWithZero):
        xor %edx, %edx
        sub $VEC_SIZE, %r8
        jbe L(StrncpyFillExit)

        vmovdqu %ymmZ, (%rdi)
        add $VEC_SIZE, %rdi

        mov %rdi, %rsi
        and $(VEC_SIZE - 1), %esi
        sub %rsi, %rdi
        add %rsi, %r8
        sub $(VEC_SIZE * 4), %r8
        jb L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
        vmovdqa %ymmZ, (%rdi)
        vmovdqa %ymmZ, VEC_SIZE(%rdi)
        vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
        vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
        add $(VEC_SIZE * 4), %rdi
        sub $(VEC_SIZE * 4), %r8
        jae L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
        add $(VEC_SIZE * 2), %r8
        jl L(StrncpyFillLessTwoVecSize)
        vmovdqa %ymmZ, (%rdi)
        vmovdqa %ymmZ, VEC_SIZE(%rdi)
        add $(VEC_SIZE * 2), %rdi
        sub $VEC_SIZE, %r8
        jl L(StrncpyFillExit)
        vmovdqa %ymmZ, (%rdi)
        add $VEC_SIZE, %rdi
        jmp L(Fill)

        .p2align 4
L(StrncpyFillLessTwoVecSize):
        add $VEC_SIZE, %r8
        jl L(StrncpyFillExit)
        vmovdqa %ymmZ, (%rdi)
        add $VEC_SIZE, %rdi
        jmp L(Fill)

        .p2align 4
L(StrncpyFillExit):
        add $VEC_SIZE, %r8
L(Fill):
        cmp $17, %r8d
        jae L(Fill17_32)
        cmp $9, %r8d
        jae L(Fill9_16)
        cmp $5, %r8d
        jae L(Fill5_8)
        cmp $3, %r8d
        jae L(Fill3_4)
        cmp $1, %r8d
        ja L(Fill2)
        je L(Fill1)
        VZEROUPPER_RETURN

/* end of ifndef USE_AS_STRCAT */
#  endif

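/* The length bound ends somewhere inside the four vectors just loaded
   by the main loop; ymm4..ymm7 still hold them and have not been
   stored yet.  If a terminator is present (%rdx non-zero) examine the
   vectors one by one, otherwise store as many whole vectors as the
   bound allows and finish via L(StrncpyExit).  */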
        .p2align 4
L(UnalignedLeaveCase2OrCase3):
        test %rdx, %rdx
        jnz L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
        lea (VEC_SIZE * 4)(%r8), %rcx
        and $-VEC_SIZE, %rcx
        add $(VEC_SIZE * 3), %r8
        jl L(CopyVecSizeCase3)
        vmovdqu %ymm4, (%rdi)
        sub $VEC_SIZE, %r8
        jb L(CopyVecSizeCase3)
        vmovdqu %ymm5, VEC_SIZE(%rdi)
        sub $VEC_SIZE, %r8
        jb L(CopyVecSizeCase3)
        vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
        sub $VEC_SIZE, %r8
        jb L(CopyVecSizeCase3)
        vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
        lea (VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
        movb $0, (VEC_SIZE * 4)(%rdi)
#  endif
        VZEROUPPER_RETURN

        .p2align 4
L(UnalignedFourVecSizeLeaveCase2):
        xor %ecx, %ecx
        vpcmpeqb %ymm4, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $(VEC_SIZE * 3), %r8
        jle L(CopyVecSizeCase2OrCase3)
        test %edx, %edx
#  ifndef USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec4)
#  else
        jnz L(CopyVecSize)
#  endif
        vpcmpeqb %ymm5, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        vmovdqu %ymm4, (%rdi)
        add $VEC_SIZE, %rcx
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
        test %edx, %edx
#  ifndef USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec5)
#  else
        jnz L(CopyVecSize)
#  endif

        vpcmpeqb %ymm6, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        vmovdqu %ymm5, VEC_SIZE(%rdi)
        add $VEC_SIZE, %rcx
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
        test %edx, %edx
#  ifndef USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec6)
#  else
        jnz L(CopyVecSize)
#  endif

        vpcmpeqb %ymm7, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
        lea VEC_SIZE(%rdi, %rcx), %rdi
        lea VEC_SIZE(%rsi, %rcx), %rsi
        bsf %edx, %edx
        cmp %r8d, %edx
        jb L(CopyVecSizeExit)
L(StrncpyExit):
        cmp $65, %r8d
        je L(StrncpyExit65)
        cmp $33, %r8d
        jae L(StrncpyExit33_64)
        cmp $17, %r8d
        jae L(StrncpyExit17_32)
        cmp $9, %r8d
        jae L(StrncpyExit9_16)
        cmp $5, %r8d
        jae L(StrncpyExit5_8)
        cmp $3, %r8d
        jae L(StrncpyExit3_4)
        cmp $1, %r8d
        ja L(StrncpyExit2)
        je L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
        mov %rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
        movb $0, (%rdi)
#  endif
        VZEROUPPER_RETURN

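/* Reached only when the length bound n is zero: nothing to copy.  */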
        .p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
        mov %rdi, %rax
#  endif
        VZEROUPPER_RETURN

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif


/* source: glibc/sysdeps/x86_64/multiarch/strcpy-avx2.S */