/* {wcs|str}ncat with 256/512-bit EVEX.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

/* Use evex-masked stores for small sizes.  Turned off at the
   moment.  */
# define USE_EVEX_MASKED_STORE 0

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifndef STRNCAT
#  define STRNCAT __strncat_evex
# endif


# ifdef USE_AS_WCSCPY
#  define MOVCHAR movl
#  define VMOVU_MASK vmovdqu32
#  define VPMIN vpminud
#  define VPTESTN vptestnmd
#  define VPTEST vptestmd
#  define VPCMPEQ vpcmpeqd
#  define CHAR_SIZE 4

#  define REP_MOVS rep movsd

#  define VMASK_REG VR10
#  define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst

#  define USE_WIDE_CHAR
# else
#  define MOVCHAR movb
#  define VMOVU_MASK vmovdqu8
#  define VPMIN vpminub
#  define VPTESTN vptestnmb
#  define VPTEST vptestmb
#  define VPCMPEQ vpcmpeqb
#  define CHAR_SIZE 1

#  define REP_MOVS rep movsb

#  define VMASK_REG VRCX
#  define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst

# endif
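
/* With the definitions above the body below is written purely in
   terms of CHARs: for wcsncat (USE_AS_WCSCPY) a CHAR is a 4-byte
   wchar_t handled with the dword forms of the compare/min/test
   instructions, while for strncat a CHAR is a single byte.  */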

# include "strncpy-or-cat-overflow-def.h"

# include "reg-macros.h"


# define VZERO VMM(7)
# define VZERO_128 VMM_128(7)

# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCAT)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif

	movq %rdi, %rax

	/* NB: It's safe to filter out zero-length calls WITHOUT
	   writing a null terminator.  The destination must already be
	   a null-terminated string, so the work is essentially
	   already done.  */
# ifdef USE_AS_WCSCPY
	leaq -1(%rdx), %rcx
	shrq $56, %rcx
	jnz L(zero_len)
# else
	test %rdx, %rdx
	jle L(zero_len)
# endif

# include "strcat-strlen-evex.h.S"
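	/* strcat-strlen-evex.h.S performs an inlined strlen on the
	   destination and leaves %rdi pointing at dst's null
	   terminator (%rsi and %rdx are preserved), so the code below
	   is effectively a bounded strcpy to that position.  */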

	movl %esi, %ecx
	andl $(PAGE_SIZE - 1), %ecx
	cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
	ja L(page_cross)
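
	/* The check above branches to the page-cross path whenever src
	   is within VEC_SIZE bytes of the end of its page, i.e.
	   whenever the full unaligned VEC load below could fault.  */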
L(page_cross_continue):
	VMOVU (%rsi), %VMM(0)
	VPTESTN %VMM(0), %VMM(0), %k0

	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
	   lengths <= CHAR_PER_VEC with masked instructions (which have
	   potential for dramatically bad perf if dst splits a page and
	   is not in the TLB).  */
# if USE_EVEX_MASKED_STORE
	KMOV %k0, %VRCX
	FIND_FIRST_ONE (VRCX, VR8)
	cmpq %r8, %rdx
	jbe L(less_1x_vec)

	test %VRCX, %VRCX
	jz L(more_1x_vec)

	blsmsk %VRCX, %VRCX
	KMOV %VRCX, %k1
	VMOVU_MASK %VMM(0), (%rdi){%k1}
	ret

L(less_1x_vec):
	mov $-1, %VRCX
	bzhi %VRDX, %VRCX, %VRCX
	KMOV %VRCX, %k1
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	VMOVU_MASK %VMM(0), (%rdi){%k1}

	ret
# else
	KMOV %k0, %VMASK_REG
	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
	   %VMASK_REG, %VRCX` for wcsncat.  */
	FIND_FIRST_ONE (VMASK_REG, VRCX)
	cmpq %rcx, %rdx
	jbe L(less_1x_vec)

	/* If there were no zero CHARs (the mask was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
	cmpl $CHAR_PER_VEC, %ecx
	je L(more_1x_vec)

	movl %ecx, %edx

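	/* Here %rdx holds the exact number of CHARs to copy: either
	   the caller's n or the index of src's first null CHAR,
	   whichever is smaller.  Copy with (possibly overlapping)
	   loads/stores chosen by size bucket and store the null
	   terminator at %rdi + %rdx * CHAR_SIZE.  */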
L(less_1x_vec):
#  if VEC_SIZE == 64
	cmpl $(32 / CHAR_SIZE), %edx
	jae L(copy_32_63)
#  endif

	cmpl $(16 / CHAR_SIZE), %edx
	jae L(copy_16_31)


	cmpl $(8 / CHAR_SIZE), %edx
	jae L(copy_8_15)

#  ifdef USE_AS_WCSCPY
	vmovd %VMM_128(0), (%rdi)
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	ret
#  else

	cmpl $4, %edx
	jae L(copy_4_7)

	movzbl (%rsi), %ecx
	cmpl $1, %edx
	jbe L(set_null_term)

	movzwl 1(%rsi), %esi
	movw %si, 1(%rdi)

	.p2align 4,, 1
L(set_null_term):
	movb %cl, (%rdi)
	MOVCHAR $0, (%rdi, %rdx)
	ret
#  endif

#  if VEC_SIZE == 64
	.p2align 4,, 6
L(copy_32_63):
	VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU %VMM_256(0), (%rdi)
	VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	ret
#  endif
	.p2align 4,, 6
L(copy_16_31):
	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
	   and will save code size.  */
	vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU %VMM_128(0), (%rdi)
	vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	ret

	.p2align 4,, 2
L(copy_8_15):
	movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
	vmovq %VMM_128(0), (%rdi)
	movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	ret

#  ifndef USE_AS_WCSCPY
	.p2align 4,, 12
L(copy_4_7):
	movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
	vmovd %VMM_128(0), (%rdi)
	movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
	ret
#  endif

# endif
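
	/* L(zero_len) is reached for n == 0 and for counts so large
	   they were filtered out at entry.  A count that large can
	   never limit the copy, so those cases branch to the unbounded
	   {wcs|str}cat via OVERFLOW_STRCAT, while n == 0 simply
	   returns.  */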
	.p2align 4,, 4
L(zero_len):
# ifdef USE_AS_WCSCPY
	test %rdx, %rdx
# endif
	jne OVERFLOW_STRCAT
	ret

	.p2align 4,, 8
L(more_1x_vec):
	VMOVU %VMM(0), (%rdi)

	/* We are going to align rsi here, so we will need to be able
	   to re-adjust rdi/rdx afterwards.  NB: We filtered out huge
	   lengths, so rsi + rdx * CHAR_SIZE cannot overflow.  */

	leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
	subq %rsi, %rdi
	andq $-(VEC_SIZE), %rsi
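
	/* Here %rdx points one VEC_SIZE before the end of the n-CHAR
	   source window, %rdi holds only the dst - src displacement,
	   and %rsi has been rounded down to a VEC_SIZE boundary.  The
	   add/sub at L(loop_last_4x_vec) adjusts %rdi so equal offsets
	   from %rsi and %rdi again address corresponding CHARs and
	   turns %rdx back into a CHAR count measured from the aligned
	   %rsi.  */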
L(loop_last_4x_vec):
	addq %rsi, %rdi
	subq %rsi, %rdx
# ifdef USE_AS_WCSCPY
	shrq $2, %rdx
# endif

	/* Will need this regardless.  */
	VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VMASK_REG

	cmpq $(CHAR_PER_VEC * 2), %rdx
	ja L(more_2x_vec)

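	/* At this point %rdx (the CHAR count measured from the aligned
	   %rsi to one VEC_SIZE before the end of the copy window) is
	   at most 2 * CHAR_PER_VEC, and src VEC 1 has already been
	   loaded and tested in %VMM(1)/VMASK_REG.  At most two more
	   vector stores plus the null terminator are needed.  */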
L(last_2x_vec):
	FIND_FIRST_ONE (VMASK_REG, VRCX)
	cmpl %ecx, %edx
	jbe L(ret_vec_x1_len)

	/* If there were no zero CHARs (the mask was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
	cmpl $CHAR_PER_VEC, %ecx
	jne L(ret_vec_x1)

	VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	addl $-CHAR_PER_VEC, %edx
	bzhi %VRDX, %VRCX, %VR8
	jz L(ret_vec_x2_len)
L(ret_vec_x2):
	bsf %VRCX, %VRDX
L(ret_vec_x2_len):
	VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
	VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
	ret

	.p2align 4,, 4
L(ret_vec_x1_len):
	movl %edx, %ecx
L(ret_vec_x1):
	VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	MOVCHAR $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
	VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
	VZEROUPPER_RETURN


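	/* L(last_4x_vec): between 4 * CHAR_PER_VEC and 8 * CHAR_PER_VEC
	   CHARs of the bound remain.  Advance src/dst by 4 * VEC_SIZE,
	   drop 4 * CHAR_PER_VEC from the count, then finish in
	   L(last_2x_vec) or fall through into L(more_2x_vec).  */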
	.p2align 4,, 8
L(last_4x_vec):
	addl $-(CHAR_PER_VEC * 4), %edx
	VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VMASK_REG
	subq $-(VEC_SIZE * 4), %rsi
	subq $-(VEC_SIZE * 4), %rdi
	cmpl $(CHAR_PER_VEC * 2), %edx
	jbe L(last_2x_vec)
	.p2align 4,, 8
L(more_2x_vec):
# ifdef USE_AS_WCSCPY
	xorl %ecx, %ecx
# endif
	bsf %VMASK_REG, %VRCX
	jnz L(ret_vec_x1)

	VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x2)

	VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
	VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
	VPTESTN %VMM(3), %VMM(3), %k0
	KMOV %k0, %VMASK_REG

	cmpq $(CHAR_PER_VEC * 4), %rdx
	ja L(more_4x_vec)

	/* Adjust length before going to L(ret_vec_x3_len) or
	   L(ret_vec_x3).  */
	addl $(CHAR_PER_VEC * -2), %edx

	FIND_FIRST_ONE (VMASK_REG, VRCX)
	cmpl %ecx, %edx
	jbe L(ret_vec_x3_len)

	/* If there were no zero CHARs (the mask was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
	cmpl $CHAR_PER_VEC, %ecx
	jne L(ret_vec_x3)

	VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	VPTESTN %VMM(4), %VMM(4), %k0
	KMOV %k0, %VRCX
	addl $-CHAR_PER_VEC, %edx
	bzhi %VRDX, %VRCX, %VR8
	jz L(ret_vec_x4_len)
L(ret_vec_x4):
	bsf %VRCX, %VRDX
L(ret_vec_x4_len):
	VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	MOVCHAR $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
	VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
	ret

	.p2align 4,, 4
L(ret_vec_x3_len):
	movl %edx, %ecx
L(ret_vec_x3):
	VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
	VMOVU %VMM(0), (VEC_SIZE * 3 -(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
	ret

	.p2align 4,, 8
L(more_4x_vec):
# ifdef USE_AS_WCSCPY
	xorl %ecx, %ecx
# endif
	bsf %VMASK_REG, %VRCX
	jnz L(ret_vec_x3)

	VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	VPTESTN %VMM(4), %VMM(4), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x4)

	VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)

	/* Check if we are near the end before aligning.  */
	cmpq $(CHAR_PER_VEC * 8), %rdx
	jbe L(last_4x_vec)


	/* Add rsi to rdx (length) before aligning rsi.  NB: Since we
	   filtered out huge lengths this cannot overflow.  */
# ifdef USE_AS_WCSCPY
	leaq (%rsi, %rdx, CHAR_SIZE), %rdx
# else
	addq %rsi, %rdx
# endif

	/* Subtract rsi from rdi before aligning (add back will have
	   correct rdi for aligned rsi).  */
	subq %rsi, %rdi
	subq $-(VEC_SIZE * 5), %rsi
	andq $(VEC_SIZE * -4), %rsi

	/* Load first half of the loop before entry.  */
	VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4

	/* Offset rsi by VEC_SIZE so that we can jump to
	   L(loop_last_4x_vec).  */
	addq $-(VEC_SIZE), %rsi
	KORTEST %k2, %k4
	jnz L(loop_4x_done)

	/* Store loop end in r9.  */
	leaq -(VEC_SIZE * 5)(%rdx), %r9

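	/* Main 4x unrolled loop: store the four VECs checked on the
	   previous iteration, then load the next four and fold their
	   zero-CHAR tests together with two VPMIN and a KORTEST.  Exit
	   either when a null CHAR is seen (L(loop_4x_done)) or when
	   the bound leaves fewer than four VECs
	   (L(loop_last_4x_vec)).  */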
	.p2align 4,, 11
L(loop_4x_vec):
	VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
	VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)

	subq $(VEC_SIZE * -4), %rsi
	cmpq %rsi, %r9
	jbe L(loop_last_4x_vec)

	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)

	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4
	KORTEST %k2, %k4
	jz L(loop_4x_vec)

L(loop_4x_done):
	VPTESTN %VMM(0), %VMM(0), %k0
	KMOV %k0, %VRCX
	/* Restore rdi (dst).  */
	addq %rsi, %rdi

	/* L(ret_vec_x1) expects rcx to hold the position of the zero
	   CHAR, so find it with bsf (which also sets ZF for the
	   branch).  */
	bsf %VRCX, %VRCX
	jnz L(ret_vec_x1)
	VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)

	KMOV %k2, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x2)
	VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)

	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	bsf %VRCX, %VRCX
	jnz L(ret_vec_x3)
	VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)

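	/* No null CHAR in VEC(0)-VEC(2), so it must be in VEC(3).
	   Since VEC(2) is null-free, the set bits of %k4 (from the
	   VPMIN of VEC(2)/VEC(3)) mark exactly the null CHARs of
	   VEC(3).  Copy one final overlapping VEC whose last CHAR is
	   the null terminator.  */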
	KMOV %k4, %VRCX
	bsf %VRCX, %VRCX
	VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
	ret


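	/* Page-cross handling: a full VEC load at src might fault, so
	   instead load the aligned VEC containing src, build the
	   zero-CHAR mask from it, and shift the mask right by src's
	   misalignment so that bit 0 corresponds to the first source
	   CHAR.  */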
	.p2align 4,, 4
L(page_cross):
	movq %rsi, %r8
	andq $(VEC_SIZE * -1), %r8
	VPCMPEQ (%r8), %VZERO, %k0

# ifdef USE_AS_WCSCPY
	KMOV %k0, %VR9
	shrl $2, %ecx
	andl $(CHAR_PER_VEC - 1), %ecx
	shrx %VRCX, %VR9, %VRCX
# else
	KMOV %k0, %VRCX
	shrx %VRSI, %VRCX, %VRCX
# endif

	subl %esi, %r8d
	andl $(VEC_SIZE - 1), %r8d
# ifdef USE_AS_WCSCPY
	shrl $2, %r8d
# endif
	cmpq %r8, %rdx
	jbe L(page_cross_small)
	/* Optimize more for space here, as this is very cold code.
	   This saves two cache lines.  */

	/* This adds one to the later bsf result, which gives the
	   correct copy bound (the null terminator is included in the
	   copy).  NB: this can never zero out a non-zero RCX because,
	   to be in the page-cross case, rsi cannot be VEC_SIZE
	   aligned, and rcx has already been right-shifted by the
	   misalignment.  */
	shl %VRCX
	jz L(page_cross_continue)
	bsf %VRCX, %VRCX
	REP_MOVS
	ret

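	/* n is no larger than the number of CHARs left on src's page:
	   copy min (index of first null CHAR, n) CHARs with rep movs
	   and write the null terminator, never touching the next
	   page.  */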
L(page_cross_small):
	tzcnt %VRCX, %VRCX
	jz L(page_cross_setz)
	cmpl %edx, %ecx
	cmova %edx, %ecx

# ifdef USE_AS_WCSCPY
	rep movsd
# else
	rep movsb
# endif
L(page_cross_setz):
	MOVCHAR $0, (%rdi)
	ret
END(STRNCAT)
#endif

/* Source: glibc/sysdeps/x86_64/multiarch/strncat-evex.S.  */