/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

	/* Use evex-masked stores for small sizes.  Turned off at the
	   moment.  */
# define USE_EVEX_MASKED_STORE	0


# include <sysdep.h>
# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif


# ifndef STRNCPY
#  define STRNCPY	__strncpy_evex
# endif

# ifdef USE_AS_WCSCPY
#  define VMOVU_MASK	vmovdqu32
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define VPTESTN	vptestnmd
#  define VPTEST	vptestmd
#  define CHAR_SIZE	4

#  define REP_MOVS	rep movsd
#  define REP_STOS	rep stosl

#  define USE_WIDE_CHAR

# else
#  define VMOVU_MASK	vmovdqu8
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define VPTESTN	vptestnmb
#  define VPTEST	vptestmb
#  define CHAR_SIZE	1

#  define REP_MOVS	rep movsb
#  define REP_STOS	rep stosb
# endif
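	/* NB: The macros above pick the element width (dword for the
	   wide-character variants, byte otherwise), so the single body
	   below implements {wcs|wcp}ncpy as well as {str|stp}ncpy.
	   REP_MOVS / REP_STOS are the matching string instructions
	   used on the page-cross and huge-length fallback paths.  */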

# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# include "reg-macros.h"


# define VZERO	VMM(7)
# define VZERO_256	VMM_256(7)
# define VZERO_128	VMM_128(7)

# if VEC_SIZE == 64
#  define VZERO_HALF	VZERO_256
# else
#  define VZERO_HALF	VZERO_128
# endif
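	/* Rough flow of the implementation below: copy forward in
	   VEC_SIZE blocks (4x VEC per iteration in the main loop)
	   until either the null terminator or the length limit is
	   reached.  Every path that finds the terminator before the
	   limit falls into one of the L(zfill*) routines, which zero
	   the rest of the buffer as strncpy requires.  Lengths below
	   one VEC are handled by size-bucketed copies (or by EVEX
	   masked stores when USE_EVEX_MASKED_STORE is enabled), and a
	   source close to a page boundary takes L(page_cross).  */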

	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCPY)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	/* Filter out zero-length and very long strings.  Zero-length
	   strings just return.  Very long strings are handled by
	   running rep stos{b|l} to zero out the destination (which
	   will almost certainly segfault); if that somehow succeeds,
	   finish by calling OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy,
	   wcpcpy).  */
# ifdef USE_AS_WCSCPY
	decq	%rdx
	movq	%rdx, %rax
	/* 56 is the end of the max supported address space.  */
	shr	$56, %rax
	jnz	L(zero_len)
# else
	decq	%rdx
	/* If the flag needs to become `jb` replace `dec` with `sub`.
	 */
	jl	L(zero_len)
# endif
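	/* NB: From here on rdx holds length - 1 (in CHARs).  The
	   compare constants below and the `adc`-based return values
	   for STPCPY all account for this bias.  */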

	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
	movl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
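	/* If the page offset of rsi is above PAGE_SIZE - VEC_SIZE, an
	   unaligned VEC-sized load from rsi could cross into the next
	   page and fault even though the string itself ends earlier,
	   so such sources take L(page_cross), which only reads from
	   the current page.  */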
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(page_cross)

L(page_cross_continue):
	VMOVU	(%rsi), %VMM(0)
	VPTESTN	%VMM(0), %VMM(0), %k0
	KMOV	%k0, %VRCX

	/* If not STPCPY, the return value is just dst, so save it
	   ahead of time.  */
# ifndef USE_AS_STPCPY
	movq	%rdi, %rax
# endif


	cmpq	$(CHAR_PER_VEC), %rdx

	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
	   lengths <= CHAR_PER_VEC with masked instructions (which
	   have potential for dramatically bad perf if dst splits a
	   page and is not in the TLB).  */
# if USE_EVEX_MASKED_STORE
	/* `jae` because length rdx is now length - 1.  */
	jae	L(more_1x_vec)

	/* If there were multiple zero-CHAR matches in the first VEC,
	   VRCX will be overset, but that is fine since any overset
	   bits are at zero positions anyway.  */

#  ifdef USE_AS_STPCPY
	tzcnt	%VRCX, %VRAX
	cmpl	%eax, %edx
	cmovb	%edx, %eax
#   ifdef USE_AS_WCSCPY
	adcl	$0, %eax
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
#   else
	adcq	%rdi, %rax
#   endif
#  endif
	dec	%VRCX

	/* Zero out all non-zero CHARs after the first zero match.  */
	KMOV	%VRCX, %k1

	/* Use VZERO as the destination so this can be reused for
	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
	   will have zeroed out VZERO).  */
	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
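	/* If the first VEC contains a null CHAR, VZERO now holds the
	   source CHARs below it and zeros from it onwards, so the
	   single masked store in L(zfill_less_vec) performs the copy
	   and the required zero-fill in one go.  */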
L(zfill_less_vec):
	/* Get mask for what we need to set.  */
	incl	%edx
	mov	$-1, %VRCX
	bzhi	%VRDX, %VRCX, %VRCX
	KMOV	%VRCX, %k1
	VMOVU_MASK %VZERO, (%rdi){%k1}
	ret

	.p2align 4,, 4
L(zero_len):
	cmpq	$-1, %rdx
	jne	L(best_effort_strncpy)
	movq	%rdi, %rax
	ret

	.p2align 4,, 8
L(more_1x_vec):
# else
	/* `jb` because length rdx is now length - 1.  */
	jb	L(less_1x_vec)
# endif


	/* This may copy CHARs past the null terminator, but that's
	   fine because we still need to zero fill.  */
	VMOVU	%VMM(0), (%rdi)


	/* Length must be >= CHAR_PER_VEC so a match here means we
	   must zero-fill.  */
	test	%VRCX, %VRCX
	jnz	L(zfill)


	/* We are going to align rsi here so we will need to be able
	   to re-adjust rdi/rdx afterwards.  NB: We filtered out huge
	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
	subq	%rsi, %rdi
	andq	$-(VEC_SIZE), %rsi
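	/* At this point rdx holds the address VEC_SIZE below the last
	   CHAR of the copy and rdi holds dst - src; L(loop_last_4x_vec)
	   turns them back into an adjusted destination pointer and a
	   remaining CHAR count (relative to (VEC_SIZE * 1)(%rsi) and
	   still carrying the -1 bias).  */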

L(loop_last_4x_vec):
	addq	%rsi, %rdi
	subq	%rsi, %rdx
# ifdef USE_AS_WCSCPY
	shrq	$2, %rdx
# endif

	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VRCX

	/* -1 because of the `dec %rdx` earlier.  */
	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
	ja	L(more_2x_vec)

L(last_2x_vec):
	/* This will need to be computed no matter what.  We do it
	   ahead of time for CHAR_PER_VEC == 64 because we can't
	   adjust the value of `tzcnt` with a shift.  */
# if CHAR_PER_VEC == 64
	tzcntq	%rcx, %rcx
# endif

	cmpl	$(CHAR_PER_VEC), %edx
	jb	L(ret_vec_x1_len)

	/* Separate logic for CHAR_PER_VEC == 64 because we already
	   did `tzcnt` on VRCX.  */
# if CHAR_PER_VEC == 64
	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
	cmpb	$CHAR_PER_VEC, %cl
	jnz	L(ret_vec_x1_no_bsf)
# else
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x1)
# endif



	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	KMOV	%k0, %VRCX

# if CHAR_PER_VEC < 64
	/* This essentially adds CHAR_PER_VEC to the result the later
	   `tzcnt` computes.  */
	shlq	$CHAR_PER_VEC, %rcx
# else
	tzcntq	%rcx, %rcx
	addl	$CHAR_PER_VEC, %ecx
# endif

	.p2align 4,, 4
L(ret_vec_x1_len):
	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it
	   has already been done.  */
# if CHAR_PER_VEC < 64
	tzcntq	%rcx, %rcx
# endif
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x1_len_no_zfill)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x1_len_no_zfill_mov):
	movl	%ecx, %edx
# ifdef USE_AS_STPCPY
	/* clear flags.  */
	xorl	%ecx, %ecx
# endif
L(ret_vec_x1_len_no_zfill):
	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
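	/* For STPCPY the adc below folds in the carry left by the
	   preceding cmpl %ecx, %edx (cleared by the xorl on the
	   _no_zfill_mov path): the extra 1 is added exactly when the
	   length limit ran out before the null terminator, so the
	   returned end pointer matches stpncpy semantics.  */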
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
#  else
	leal	(VEC_SIZE)(%rdx), %eax
	adcq	%rdi, %rax
#  endif
# endif
	ret


	.p2align 4,, 10
L(ret_vec_x1):
	bsf	%VRCX, %VRCX
L(ret_vec_x1_no_bsf):
	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
	subl	%ecx, %edx
	cmpl	$CHAR_PER_VEC, %edx
	jb	L(ret_vec_x1_len_no_zfill_mov)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

	.p2align 4,, 8
L(last_4x_vec):
	/* Separate logic for CHAR_PER_VEC == 64 because the `andl
	   $(CHAR_PER_VEC * 4 - 1), %edx` can be done with less code
	   size by just using `movzbl`.  */
# if CHAR_PER_VEC == 64
	movzbl	%dl, %edx
# else
	andl	$(CHAR_PER_VEC * 4 - 1), %edx
# endif
	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VRCX
	subq	$-(VEC_SIZE * 4), %rsi
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
	jbe	L(last_2x_vec)
	.p2align 4,, 8
L(more_2x_vec):
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	test	%VRCX, %VRCX
	/* Must fill at least 2x VEC.  */
	jnz	L(zfill_vec1)

	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	/* Must fill at least 1x VEC.  */
	jnz	L(zfill_vec2)

	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
	VPTESTN	%VMM(3), %VMM(3), %k0
	KMOV	%k0, %VRCX

	/* Check if len is more than 4x VEC.  -1 because rdx is
	   len - 1.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
	ja	L(more_4x_vec)

	subl	$(CHAR_PER_VEC * 3), %edx
	jb	L(ret_vec_x3_len)

	test	%VRCX, %VRCX
	jnz	L(ret_vec_x3)

	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	KMOV	%k0, %VRCX
	tzcnt	%VRCX, %VRCX
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x4_len_no_zfill)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
	movl	%ecx, %edx
L(ret_vec_x4_len_no_zfill):
	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
#  else
	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
	adcq	%rdi, %rax
#  endif
# endif
	ret


L(ret_vec_x3_len):
	addl	$(CHAR_PER_VEC * 1), %edx
	tzcnt	%VRCX, %VRCX
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x3_len_no_zfill)
	/* Fall through (the expected case) is copy len < buffer len.  */
	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x3_len_no_zfill_mov):
	movl	%ecx, %edx
# ifdef USE_AS_STPCPY
	/* clear flags.  */
	xorl	%ecx, %ecx
# endif
	.p2align 4,, 4
L(ret_vec_x3_len_no_zfill):
	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
#  else
	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
	adcq	%rdi, %rax
#  endif
# endif
	ret


	.p2align 4,, 8
L(ret_vec_x3):
	bsf	%VRCX, %VRCX
	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
	subl	%ecx, %edx
	jl	L(ret_vec_x3_len_no_zfill_mov)
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

	.p2align 4,, 8
L(more_4x_vec):
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	test	%VRCX, %VRCX
	jnz	L(zfill_vec3)

	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
	VPTESTN	%VMM(4), %VMM(4), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec4)

	/* Recheck length before aligning.  */
	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
	jbe	L(last_4x_vec)

	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
# ifdef USE_AS_WCSCPY
	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
# else
	addq	%rsi, %rdx
# endif
	subq	%rsi, %rdi
	subq	$-(VEC_SIZE * 5), %rsi
	andq	$(VEC_SIZE * -4), %rsi
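	/* rdi now holds dst - src (so destinations are addressed as
	   (...)(%rdi, %rsi)), rdx holds the address VEC_SIZE below the
	   last CHAR of the copy, and rsi is (src + VEC_SIZE * 5)
	   rounded down to a multiple of VEC_SIZE * 4 so the VMOVA
	   loads below are aligned.  */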


	/* Load first half of the loop before entry.  */
	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPTESTN	%VMM(4), %VMM(4), %k2
	VPTESTN	%VMM(6), %VMM(6), %k4


	/* Offset rsi by VEC_SIZE so that we can jump to
	   L(loop_last_4x_vec).  */
	addq	$-(VEC_SIZE), %rsi
	KORTEST	%k2, %k4
	jnz	L(loop_4x_done)

	/* Store loop end in r9.  */
	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
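	/* r9 sits VEC_SIZE * 6 below the end of the region to copy.
	   The main loop keeps iterating while r9 is above the
	   already-advanced rsi; once it is not, the remaining tail is
	   small enough for L(loop_last_4x_vec) and the paths after it
	   to finish.  */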

	.p2align 4,, 11
L(loop_4x_vec):
	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)

	subq	$(VEC_SIZE * -4), %rsi
	cmpq	%rsi, %r9
	jbe	L(loop_last_4x_vec)

	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPTESTN	%VMM(4), %VMM(4), %k2
	VPTESTN	%VMM(6), %VMM(6), %k4
	KORTEST	%k2, %k4
	jz	L(loop_4x_vec)

L(loop_4x_done):
	/* Restore rdx (length).  */
	subq	%rsi, %rdx
# ifdef USE_AS_WCSCPY
	shrq	$2, %rdx
# endif
	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	/* Restore rdi (dst).  */
	addq	%rsi, %rdi
	VPTESTN	%VMM(0), %VMM(0), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec1)

	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
	KMOV	%k2, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec2)

	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec3)

	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
	KMOV	%k4, %VRCX
	// Zfill more....
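	/* Fall through: VMM(0), VMM(1) and VMM(2) were all checked
	   above, so the null CHAR that stopped the loop is in VMM(3)
	   and k4 (nulls of VPMIN(VMM(2), VMM(3))) is the non-zero
	   mask L(zfill_vec4) expects in VRCX.  */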

	.p2align 4,, 4
L(zfill_vec4):
	subq	$(VEC_SIZE * -2), %rdi
	addq	$(CHAR_PER_VEC * -2), %rdx
L(zfill_vec2):
	subq	$(VEC_SIZE * -2), %rdi
	addq	$(CHAR_PER_VEC * -1), %rdx
L(zfill):
	/* VRCX must be non-zero.  */
	bsf	%VRCX, %VRCX

	/* Adjust length / dst for zfill.  */
	subq	%rcx, %rdx
# ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
# else
	addq	%rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
L(zfill_from_page_cross):

	/* From here on out it's just memset(rdi, 0, rdx).  */
	cmpq	$CHAR_PER_VEC, %rdx
	jb	L(zfill_less_vec)

L(zfill_more_1x_vec):
	VMOVU	%VZERO, (%rdi)
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
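	/* The store at (%rdi) and the store ending at the last CHAR
	   to be zeroed overlap, so together they complete any fill of
	   at most 2x VEC; larger fills continue at
	   L(zfill_more_2x_vec).  */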
	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
	ja	L(zfill_more_2x_vec)
L(zfill_done0):
	ret

	/* Coming from vec1/vec2 we must be able to zfill at least 2x
	   VEC.  */
	.p2align 4,, 8
L(zfill_vec3):
	subq	$(VEC_SIZE * -2), %rdi
	addq	$(CHAR_PER_VEC * -2), %rdx
	.p2align 4,, 2
L(zfill_vec1):
	bsfq	%rcx, %rcx
	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
	 */
	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
	subq	%rcx, %rdx
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif


	VMOVU	%VZERO, (%rdi)
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jb	L(zfill_done0)
L(zfill_more_2x_vec):
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
	jbe	L(zfill_done)

# ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
# else
	addq	%rdi, %rdx
# endif

	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)


	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)

	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	jbe	L(zfill_done)

	/* Align rdi and zfill loop.  */
	andq	$-(VEC_SIZE), %rdi
	.p2align 4,, 12
L(zfill_loop_4x_vec):
	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(zfill_loop_4x_vec)
L(zfill_done):
	ret


	/* Less than 1x VEC case if we are not using evex masked
	   stores.  */
# if !USE_EVEX_MASKED_STORE
	.p2align 4,, 8
L(copy_1x):
	/* Special case for copying 1x VEC.  It can be handled quickly
	   and many buffer sizes have convenient alignment.  */
	VMOVU	%VMM(0), (%rdi)
	/* If no zeros then we are done.  */
	testl	%ecx, %ecx
	jz	L(ret_1x_1x)

	/* Need to zfill.  Note that we know length <= CHAR_PER_VEC,
	   so we only handle the small case here.  */
	bsf	%VRCX, %VRCX
L(zfill_less_vec_no_bsf):
	/* Adjust length / dst then just zfill less_vec.  */
	subq	%rcx, %rdx
#  ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
#  else
	addq	%rcx, %rdi
#  endif
#  ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#  endif

L(zfill_less_vec):
	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
	jb	L(zfill_less_half)

	VMOVU	%VZERO_HALF, (%rdi)
	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2) - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	ret
#  ifdef USE_AS_STPCPY
L(ret_1x_1x):
	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
	ret
#  endif


#  if VEC_SIZE == 64
	.p2align 4,, 4
L(copy_32_63):
	/* Overfill to avoid branches.  */
	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU	%VMM_256(0), (%rdi)
	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)

	/* We are taking advantage of the fact that, to be here, we
	   must be writing the null-term at (%rdi, %rcx), so we have a
	   byte of leeway for overwriting.  */
	cmpl	%ecx, %edx
	ja	L(zfill_less_vec_no_bsf)
#   ifndef USE_AS_STPCPY
L(ret_1x_1x):
#   else
#    ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
#    else
	movl	%edx, %eax
	adcq	%rdi, %rax
#    endif
#   endif
	ret
#  endif

	.p2align 4,, 4
L(copy_16_31):
	/* Overfill to avoid branches.  */
	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU	%VMM_128(0), (%rdi)
	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpl	%ecx, %edx

	/* Separate logic depending on VEC_SIZE.  If VEC_SIZE == 64
	   then we have a larger copy block for 32-63, so this just
	   falls through to the zfill for 16-31.  If VEC_SIZE == 32
	   then we check for a full zfill of less than 1x VEC.  */
#  if VEC_SIZE == 64
	jbe	L(ret_16_31)
	subl	%ecx, %edx
#   ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
#   else
	addq	%rcx, %rdi
#   endif
#   ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#   endif
L(zfill_less_half):
L(zfill_less_32):
	cmpl	$(16 / CHAR_SIZE), %edx
	jb	L(zfill_less_16)
	VMOVU	%VZERO_128, (%rdi)
	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
#   ifdef USE_AS_STPCPY
	ret
#   endif
L(ret_16_31):
#   ifdef USE_AS_STPCPY
#    ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
#    else
	movl	%edx, %eax
	adcq	%rdi, %rax
#    endif
#   endif
	ret
#  else
	/* VEC_SIZE == 32 begins.  */
	ja	L(zfill_less_vec_no_bsf)
#   ifndef USE_AS_STPCPY
L(ret_1x_1x):
#   else
#    ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
#    else
	movl	%edx, %eax
	adcq	%rdi, %rax
#    endif
#   endif
	ret
#  endif


	.p2align 4,, 4
L(copy_8_15):
	/* Overfill to avoid branches.  */
	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
	vmovq	%VMM_128(0), (%rdi)
	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpl	%ecx, %edx
	jbe	L(ret_8_15)
	subl	%ecx, %edx
#  ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
#  else
	addq	%rcx, %rdi
#  endif
#  ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#  endif
	.p2align 4,, 8
#  if VEC_SIZE == 32
L(zfill_less_half):
#  endif
L(zfill_less_16):
	xorl	%ecx, %ecx
	cmpl	$(8 / CHAR_SIZE), %edx
	jb	L(zfill_less_8)
	movq	%rcx, (%rdi)
	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
#  ifndef USE_AS_STPCPY
L(ret_8_15):
#  endif
	ret

	.p2align 4,, 8
L(less_1x_vec):
	je	L(copy_1x)

	/* We will need `tzcnt` result for all other copy sizes.  */
	tzcnt	%VRCX, %VRCX
#  if VEC_SIZE == 64
	cmpl	$(32 / CHAR_SIZE), %edx
	jae	L(copy_32_63)
#  endif

	cmpl	$(16 / CHAR_SIZE), %edx
	jae	L(copy_16_31)

	cmpl	$(8 / CHAR_SIZE), %edx
	jae	L(copy_8_15)
#  ifdef USE_AS_WCSCPY
	testl	%ecx, %ecx
	jz	L(zfill_less_8_set_ret)

	movl	(%rsi, %rdx, CHAR_SIZE), %esi
	vmovd	%VMM_128(0), (%rdi)
	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
#   ifdef USE_AS_STPCPY
	cmpl	%ecx, %edx
L(ret_8_15):
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
#   endif
	ret
L(zfill_less_8_set_ret):
	xorl	%ecx, %ecx
#   ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#   endif
L(zfill_less_8):
	movl	%ecx, (%rdi)
	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
	ret
#  else
	cmpl	$3, %edx
	jb	L(copy_0_3)
	/* Overfill to avoid branches.  */
	movl	-3(%rsi, %rdx), %esi
	vmovd	%VMM_128(0), (%rdi)
	movl	%esi, -3(%rdi, %rdx)
	cmpl	%ecx, %edx
	jbe	L(ret_4_7)
	subq	%rcx, %rdx
	addq	%rcx, %rdi
#   ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#   endif
	xorl	%ecx, %ecx
	.p2align 4,, 8
L(zfill_less_8):
	cmpl	$3, %edx
	jb	L(zfill_less_3)
	movl	%ecx, (%rdi)
	movl	%ecx, -3(%rdi, %rdx)
#   ifdef USE_AS_STPCPY
	ret
#   endif

L(ret_4_7):
#   ifdef USE_AS_STPCPY
L(ret_8_15):
	movl	%edx, %eax
	adcq	%rdi, %rax
#   endif
	ret

	.p2align 4,, 4
L(zfill_less_3):
	testl	%edx, %edx
	jz	L(zfill_1)
	movw	%cx, (%rdi)
L(zfill_1):
	movb	%cl, (%rdi, %rdx)
	ret

	.p2align 4,, 8
L(copy_0_3):
	vmovd	%VMM_128(0), %r8d
	testl	%edx, %edx
	jz	L(copy_1)
	movw	%r8w, (%rdi)
	cmpl	%ecx, %edx
	ja	L(zfill_from_1)
	movzbl	(%rsi, %rdx), %r8d
#   ifdef USE_AS_STPCPY
	movl	%edx, %eax
	adcq	%rdi, %rax
	movb	%r8b, (%rdi, %rdx)
	ret
#   endif

L(copy_1):
#   ifdef USE_AS_STPCPY
	movl	%edx, %eax
	cmpl	%ecx, %edx
	adcq	%rdi, %rax
#   endif
#   ifdef USE_AS_WCSCPY
	vmovd	%VMM_128(0), (%rdi)
#   else
	movb	%r8b, (%rdi, %rdx)
#   endif
	ret
#  endif


#  ifndef USE_AS_WCSCPY
	.p2align 4,, 8
L(zfill_from_1):
#   ifdef USE_AS_STPCPY
	leaq	(%rdi, %rcx), %rax
#   endif
	movw	$0, -1(%rdi, %rdx)
	ret
#  endif

	.p2align 4,, 4
L(zero_len):
	incq	%rdx
	jne	L(best_effort_strncpy)
	movq	%rdi, %rax
	ret
# endif


	.p2align 4,, 4
	.p2align 6,, 8
L(page_cross):
	movq	%rsi, %rax
	andq	$(VEC_SIZE * -1), %rax
	VPCMPEQ	(%rax), %VZERO, %k0
	KMOV	%k0, %VRCX
# ifdef USE_AS_WCSCPY
	movl	%esi, %r8d
	shrl	$2, %r8d
	andl	$(CHAR_PER_VEC - 1), %r8d
	shrx	%VR8, %VRCX, %VRCX
# else
	shrx	%VRSI, %VRCX, %VRCX
# endif
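	/* VRCX now has one bit per CHAR of the aligned VEC containing
	   rsi, shifted so that bit 0 corresponds to the CHAR at rsi
	   itself; set bits mark zero CHARs at or after rsi (up to the
	   end of the page).  */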

	/* Compute the number of bytes we checked.  */
	subl	%esi, %eax
	andl	$(VEC_SIZE - 1), %eax
# ifdef USE_AS_WCSCPY
	shrl	$2, %eax
# endif

	/* If rdx is below rax then the entire copy fits within the
	   CHARs we just checked, i.e. it finishes before the end of
	   the page.  */
	cmpq	%rax, %rdx
	jb	L(page_cross_small)


	/* If rcx is zero then no zero CHAR was found up to the end of
	   the page, so the string continues onto the next page and it
	   is safe to resume the normal path.  Otherwise handle the
	   copy here.  */
	test	%VRCX, %VRCX
	jz	L(page_cross_continue)

	/* We found a zero CHAR, so copy up to it and then zfill (we
	   know we didn't cover the whole length here).  */
	bsf	%VRCX, %VRCX
L(movsb_and_zfill):
	incl	%ecx
	subq	%rcx, %rdx
# ifdef USE_AS_STPCPY
	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
# else
	movq	%rdi, %rax
# endif

	REP_MOVS
# ifdef USE_AS_WCSCPY
	movl	$0, (%rdi)
# else
	movb	$0, (%rdi)
# endif
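	/* rcx CHARs (including the null terminator) have just been
	   copied by REP_MOVS and rdx was reduced accordingly, keeping
	   the usual -1 bias relative to the CHARs still to be zeroed,
	   so the shared zfill code below clears the rest of the
	   buffer.  */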
	jmp	L(zfill_from_page_cross)

L(page_cross_small):
	tzcnt	%VRCX, %VRCX
	cmpl	%ecx, %edx
	jbe	L(page_cross_copy_only)

	/* Do a zfill of the tail before copying.  */
	movq	%rdi, %r9
	xorl	%eax, %eax

	movl	%ecx, %r8d

	subl	%ecx, %edx
	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
	movl	%edx, %ecx
	REP_STOS
	movq	%r9, %rdi
	movl	%r8d, %edx
L(page_cross_copy_only):
	leal	1(%rdx), %ecx
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	adcl	$0, %edx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
#  else
	movl	%edx, %eax
	adcq	%rdi, %rax
#  endif
# else
	movq	%rdi, %rax
# endif
	REP_MOVS
	ret


L(best_effort_strncpy):
	movq	%rdx, %rcx
	xorl	%eax, %eax
	movq	%rdi, %r8
	/* The length is >= 2^63.  We fully expect the rep stos to
	   segfault.  If it somehow does not, just strcpy to finish.  */
	REP_STOS
	movq	%r8, %rdi
	jmp	OVERFLOW_STRCPY
END(STRNCPY)
#endif
