/* strcpy with AVX2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

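/* Overview: the string is scanned one VEC_SIZE (32-byte) vector at a
   time.  VPCMPEQB against a zeroed YMM register marks the null bytes,
   VPMOVMSKB turns that into a bit mask, and BSF finds the index of the
   terminator so the final partial copy can be dispatched by size.  The
   USE_AS_STRNCPY / USE_AS_STPCPY / USE_AS_STRCAT macros select the
   strncpy, stpcpy and strcat flavours when this file is reused by those
   implementations.  */
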
#if IS_IN (libc)

# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY __strcpy_avx2
#  endif

# endif

/* Number of bytes in a vector register.  */
# ifndef VEC_SIZE
#  define VEC_SIZE 32
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

/* zero register */
#define xmmZ xmm0
#define ymmZ ymm0

/* mask register */
#define ymmM ymm1

# ifndef USE_AS_STRCAT

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
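/* Arguments (SysV x86-64 ABI): %rdi = destination, %rsi = source, and
   %rdx = length limit when USE_AS_STRNCPY is defined.  %rdi is saved in
   %rax up front as the return value, except for stpcpy, which instead
   returns a pointer to the copied null terminator.  */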
#  ifdef USE_AS_STRNCPY
	mov %RDX_LP, %R8_LP
	test %R8_LP, %R8_LP
	jz L(ExitZero)
#  endif
	mov %rsi, %rcx
#  ifndef USE_AS_STPCPY
	mov %rdi, %rax	/* save result */
#  endif

# endif

	vpxor %xmmZ, %xmmZ, %xmmZ

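/* %ecx = source offset within a (VEC_SIZE * 4)-byte block.  If that
   offset is at most VEC_SIZE * 2, the first two unaligned vector loads
   cannot cross the block (and therefore cannot cross a page boundary),
   so the simpler path below is taken.  */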
	and $((VEC_SIZE * 4) - 1), %ecx
	cmp $(VEC_SIZE * 2), %ecx
	jbe L(SourceStringAlignmentLessTwoVecSize)

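/* Round the source down to a VEC_SIZE boundary and keep the
   misalignment in %ecx.  The null-byte mask of the aligned vector is
   shifted right by that misalignment so bits belonging to bytes before
   the real start of the string are discarded.  */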
	and $-VEC_SIZE, %rsi
	and $(VEC_SIZE - 1), %ecx

	vpcmpeqb (%rsi), %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	shr %cl, %rdx

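/* For strncpy, check whether the length limit already ends inside the
   first vector; if so, take the bounded Case2/Case3 paths.  */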
# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov $VEC_SIZE, %r10
	sub %rcx, %r10
	cmp %r10, %r8
#  else
	mov $(VEC_SIZE + 1), %r10
	sub %rcx, %r10
	cmp %r10, %r8
#  endif
	jbe L(CopyVecSizeTailCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyVecSizeTail)

	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
	vpmovmskb %ymm2, %edx

# ifdef USE_AS_STRNCPY
	add $VEC_SIZE, %r10
	cmp %r10, %r8
	jbe L(CopyTwoVecSizeCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyTwoVecSize)

	vmovdqu (%rsi, %rcx), %ymm2	/* copy VEC_SIZE bytes */
	vmovdqu %ymm2, (%rdi)

/* If source address alignment != destination address alignment */
	.p2align 4
L(UnalignVecSizeBoth):
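/* From here on %rsi is VEC_SIZE-aligned while %rdi keeps its original
   offset; %rdi is biased down by the misalignment so that (%rdi, %rcx)
   mirrors (%rsi, %rcx).  For strncpy the remaining count in %r8 is
   rebased to the aligned source: the ADD/SBB/OR sequence saturates %r8
   to all ones if the addition carries, so the count cannot wrap.  */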
	sub %rcx, %rdi
# ifdef USE_AS_STRNCPY
	add %rcx, %r8
	sbb %rcx, %rcx
	or %rcx, %r8
# endif
	mov $VEC_SIZE, %rcx
	vmovdqa (%rsi, %rcx), %ymm2
	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 3), %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec3)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec4)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm4, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vmovdqu %ymm2, (%rdi, %rcx)
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec3)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	mov %rsi, %rdx
	lea VEC_SIZE(%rsi, %rcx), %rsi
	and $-(VEC_SIZE * 4), %rsi
	sub %rsi, %rdx
	sub %rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea (VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
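
/* Main loop: both pointers now step in (VEC_SIZE * 4)-byte strides and
   %rsi is (VEC_SIZE * 4)-aligned.  Four vectors are loaded, VPMINUB
   folds them into one, and a single compare plus VPMOVMSKB tells
   whether any of the 4 * VEC_SIZE bytes is zero.  %ymmM holds all
   zeros here (every null-byte mask checked on the way in was empty),
   so comparing against it is the same as comparing against %ymmZ.  */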
L(UnalignedFourVecSizeLoop):
	vmovdqa (%rsi), %ymm4
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm5, %ymm4, %ymm2
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 4), %r8
	jbe L(UnalignedLeaveCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add $(VEC_SIZE * 4), %rdi
	add $(VEC_SIZE * 4), %rsi
	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
	vmovdqa (%rsi), %ymm4
	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vpminub %ymm5, %ymm4, %ymm2
	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqu %ymm7, -VEC_SIZE(%rdi)
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 4), %r8
	jbe L(UnalignedLeaveCase2OrCase3)
# endif
	test %edx, %edx
	jz L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test %edx, %edx
	jnz L(CopyVecSizeUnaligned_0)

	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	test %ecx, %ecx
	jnz L(CopyVecSizeUnaligned_16)

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test %edx, %edx
	jnz L(CopyVecSizeUnaligned_32)

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	bsf %ecx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
	add $(VEC_SIZE - 1), %r8
	sub %rdx, %r8
	lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $(VEC_SIZE * 3), %rsi
	add $(VEC_SIZE * 3), %rdi
	jmp L(CopyVecSizeExit)
# endif

/* If source address alignment == destination address alignment */

L(SourceStringAlignmentLessTwoVecSize):
	vmovdqu (%rsi), %ymm3
	vmovdqu VEC_SIZE(%rsi), %ymm2
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp $VEC_SIZE, %r8
#  else
	cmp $(VEC_SIZE + 1), %r8
#  endif
	jbe L(CopyVecSizeTail1Case2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyVecSizeTail1)

	vmovdqu %ymm3, (%rdi)
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp $(VEC_SIZE * 2), %r8
#  else
	cmp $((VEC_SIZE * 2) + 1), %r8
#  endif
	jbe L(CopyTwoVecSize1Case2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyTwoVecSize1)

	and $-VEC_SIZE, %rsi
	and $(VEC_SIZE - 1), %ecx
	jmp L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

/* Case1 */
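/* Case1: the null terminator was found and, for strncpy, the length
   limit is known not to end first.  Case2: the current null-byte mask
   is non-zero, but the terminator position still has to be compared
   against the remaining limit.  Case3: the limit runs out with no
   terminator found, so exactly %r8 more bytes are copied.  */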

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add %rcx, %rdi
# endif
L(CopyVecSizeTail):
	add %rcx, %rsi
L(CopyVecSizeTail1):
	bsf %edx, %edx
L(CopyVecSizeExit):
	cmp $32, %edx
	jae L(Exit32_63)
	cmp $16, %edx
	jae L(Exit16_31)
	cmp $8, %edx
	jae L(Exit8_15)
	cmp $4, %edx
	jae L(Exit4_7)
	cmp $3, %edx
	je L(Exit3)
	cmp $1, %edx
	ja L(Exit2)
	je L(Exit1)
	movb $0, (%rdi)
# ifdef USE_AS_STPCPY
	lea (%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $1, %r8
	lea 1(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(CopyTwoVecSize1):
	add $VEC_SIZE, %rsi
	add $VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $VEC_SIZE, %r8
# endif
	jmp L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf %edx, %edx
	add %rcx, %rsi
	add $VEC_SIZE, %edx
	sub %ecx, %edx
	jmp L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm4, (%rdi)
	add $((VEC_SIZE * 4) - 1), %r8
	sub %rdx, %r8
	lea 1(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	jmp L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf %ecx, %edx
	vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea VEC_SIZE(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add $((VEC_SIZE * 3) - 1), %r8
	sub %rdx, %r8
	lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $VEC_SIZE, %rsi
	add $VEC_SIZE, %rdi
	jmp L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf %edx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	add $((VEC_SIZE * 2) - 1), %r8
	sub %rdx, %r8
	lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $(VEC_SIZE * 2), %rsi
	add $(VEC_SIZE * 2), %rdi
	jmp L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	vmovdqu %ymm6, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	vmovdqu %ymm5, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	vmovdqu %ymm4, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	vmovdqu %ymm3, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)
#  endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add $VEC_SIZE, %r8
	add %rcx, %rdi
	add %rcx, %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add %rcx, %rsi
	bsf %edx, %edx
	add $VEC_SIZE, %edx
	sub %ecx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add %rcx, %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

/* Case2 or Case3, Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add $VEC_SIZE, %r8
	add %rcx, %rdi
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyTwoVecSizeCase2)
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeTailCase2)
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add $VEC_SIZE, %rdi
	add $VEC_SIZE, %rsi
	sub $VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeTail1Case2)
	jmp L(StrncpyExit)
# endif

/*----- Exit labels that copy the last 1 to VEC_SIZE or 1 to (VEC_SIZE * 2) bytes -----*/

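/* L(ExitN) and L(ExitA_B) are reached with %rdx holding the index of
   the null terminator relative to %rsi.  They copy the terminator and
   everything before it (using overlapping loads for the larger sizes),
   set the stpcpy return value, and for strncpy jump to
   L(StrncpyFillTailWithZero) to pad the remainder of the buffer.  */
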
	.p2align 4
L(Exit1):
	movzwl (%rsi), %edx
	mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $2, %r8
	lea 2(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit2):
	movzwl (%rsi), %ecx
	mov %cx, (%rdi)
	movb $0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea 2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $3, %r8
	lea 3(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit3):
	mov (%rsi), %edx
	mov %edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $4, %r8
	lea 4(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit4_7):
	mov (%rsi), %ecx
	mov %ecx, (%rdi)
	mov -3(%rsi, %rdx), %ecx
	mov %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit8_15):
	mov (%rsi), %rcx
	mov -7(%rsi, %rdx), %r9
	mov %rcx, (%rdi)
	mov %r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit16_31):
	vmovdqu (%rsi), %xmm2
	vmovdqu -15(%rsi, %rdx), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit32_63):
	vmovdqu (%rsi), %ymm2
	vmovdqu -31(%rsi, %rdx), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCPY

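/* L(StrncpyExitN) / L(StrncpyExitA_B): the length limit was reached
   first.  Exactly %r8 bytes are copied (again with overlapping loads);
   no null terminator is appended except in the strcat case, and the
   stpcpy return value points just past the last byte written.  */
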
	.p2align 4
L(StrncpyExit1):
	movzbl (%rsi), %edx
	mov %dl, (%rdi)
#  ifdef USE_AS_STPCPY
	lea 1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb $0, 1(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit2):
	movzwl (%rsi), %edx
	mov %dx, (%rdi)
#  ifdef USE_AS_STPCPY
	lea 2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb $0, 2(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit3_4):
	movzwl (%rsi), %ecx
	movzwl -2(%rsi, %r8), %edx
	mov %cx, (%rdi)
	mov %dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit5_8):
	mov (%rsi), %ecx
	mov -4(%rsi, %r8), %edx
	mov %ecx, (%rdi)
	mov %edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit9_16):
	mov (%rsi), %rcx
	mov -8(%rsi, %r8), %rdx
	mov %rcx, (%rdi)
	mov %rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit17_32):
	vmovdqu (%rsi), %xmm2
	vmovdqu -16(%rsi, %r8), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit33_64):
	/* Copy VEC_SIZE bytes from the start and VEC_SIZE bytes ending at
	   offset %r8; the two stores may overlap.  */
	vmovdqu (%rsi), %ymm2
	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit65):
	/* Copy 32 bytes at offsets 0 and 32, then the single byte at 64.  */
	vmovdqu (%rsi), %ymm2
	vmovdqu 32(%rsi), %ymm3
	mov 64(%rsi), %cl
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, 32(%rdi)
	mov %cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
	lea 65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb $0, 65(%rdi)
#  endif
	VZEROUPPER_RETURN

#  ifndef USE_AS_STRCAT

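/* The L(FillN) / L(FillA_B) labels store zero bytes (%rdx is zero when
   they are reached) to pad the last few bytes of an strncpy
   destination; L(Fill17_32) uses the zeroed %xmmZ register directly.  */
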
	.p2align 4
L(Fill1):
	mov %dl, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill2):
	mov %dx, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill3_4):
	mov %dx, (%rdi)
	mov %dx, -2(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill5_8):
	mov %edx, (%rdi)
	mov %edx, -4(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill9_16):
	mov %rdx, (%rdi)
	mov %rdx, -8(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill17_32):
	vmovdqu %xmmZ, (%rdi)
	vmovdqu %xmmZ, -16(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	vmovdqu %ymm2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf %edx, %edx
	add $(VEC_SIZE - 1), %r8
	add %rcx, %rdi
#   ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
#   endif
	sub %rdx, %r8
	lea 1(%rdi, %rdx), %rdi

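/* Zero-fill the %r8 bytes that remain after the terminator: one
   unaligned vector store first, then the destination is rounded down to
   a VEC_SIZE boundary and filled (VEC_SIZE * 4) bytes per iteration,
   with the L(Fill*) labels finishing the last partial vector.  */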
	.p2align 4
L(StrncpyFillTailWithZero):
	xor %edx, %edx
	sub $VEC_SIZE, %r8
	jbe L(StrncpyFillExit)

	vmovdqu %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi

	mov %rdi, %rsi
	and $(VEC_SIZE - 1), %esi
	sub %rsi, %rdi
	add %rsi, %r8
	sub $(VEC_SIZE * 4), %r8
	jb L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
	add $(VEC_SIZE * 4), %rdi
	sub $(VEC_SIZE * 4), %r8
	jae L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add $(VEC_SIZE * 2), %r8
	jl L(StrncpyFillLessTwoVecSize)
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	add $(VEC_SIZE * 2), %rdi
	sub $VEC_SIZE, %r8
	jl L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi
	jmp L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add $VEC_SIZE, %r8
	jl L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi
	jmp L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add $VEC_SIZE, %r8
L(Fill):
	cmp $17, %r8d
	jae L(Fill17_32)
	cmp $9, %r8d
	jae L(Fill9_16)
	cmp $5, %r8d
	jae L(Fill5_8)
	cmp $3, %r8d
	jae L(Fill3_4)
	cmp $1, %r8d
	ja L(Fill2)
	je L(Fill1)
	VZEROUPPER_RETURN

/* end of ifndef USE_AS_STRCAT */
#  endif

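/* The strncpy limit expired inside the four-vector loop.  Dispatch on
   whether the combined null mask in %rdx is non-zero (Case2: a
   terminator may still fall within the limit) or zero (Case3: copy
   whole vectors until the remaining count is used up).  */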
	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test %rdx, %rdx
	jnz L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea (VEC_SIZE * 4)(%r8), %rcx
	and $-VEC_SIZE, %rcx
	add $(VEC_SIZE * 3), %r8
	jl L(CopyVecSizeCase3)
	vmovdqu %ymm4, (%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb $0, (VEC_SIZE * 4)(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor %ecx, %ecx
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $(VEC_SIZE * 3), %r8
	jle L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
#  ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec4)
#  else
	jnz L(CopyVecSize)
#  endif
	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm4, (%rdi)
	add $VEC_SIZE, %rcx
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
#  ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec5)
#  else
	jnz L(CopyVecSize)
#  endif

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add $VEC_SIZE, %rcx
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
#  ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec6)
#  else
	jnz L(CopyVecSize)
#  endif

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	lea VEC_SIZE(%rdi, %rcx), %rdi
	lea VEC_SIZE(%rsi, %rcx), %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
L(StrncpyExit):
	cmp $65, %r8d
	je L(StrncpyExit65)
	cmp $33, %r8d
	jae L(StrncpyExit33_64)
	cmp $17, %r8d
	jae L(StrncpyExit17_32)
	cmp $9, %r8d
	jae L(StrncpyExit9_16)
	cmp $5, %r8d
	jae L(StrncpyExit5_8)
	cmp $3, %r8d
	jae L(StrncpyExit3_4)
	cmp $1, %r8d
	ja L(StrncpyExit2)
	je L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
	mov %rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb $0, (%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
	mov %rdi, %rax
#  endif
	VZEROUPPER_RETURN

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif