1/* wcscpy with SSSE3
2 Copyright (C) 2011-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#if IS_IN (libc)
20# include <sysdep.h>
21
22 .section .text.ssse3,"ax",@progbits
/* __wcscpy_ssse3: copy 4-byte elements from the address in %rsi to the
   address in %rdi, up to and including the first zero element.

   In:   %rdi = destination, %rsi = source.
   Out:  %rax = the value %rdi had on entry (%rdi is never written).
   Writes: %rax, %rcx, %rdx, %rsi, %r9, %xmm0-%xmm7 and the flags.
   The stack is not used.

   Working registers:
     %rcx  read cursor, %rdx  write cursor,
     %rsi  byte offset that CopyFrom1To16Bytes adds to both cursors,
     %xmm0 zero comparand, it stays all-zero for as long as no
           comparison matches.

   NOTE(review): the dispatch at the end of this block only
   distinguishes the offsets 0, 4 and 8 and sends everything else to
   Shl12, so both pointers are presumably expected to be 4-byte
   aligned - confirm.  */
23ENTRY (__wcscpy_ssse3)
24
25 mov %rsi, %rcx
26 mov %rdi, %rdx
27
 /* A zero element among the first four elements is handled by the
    ExitN tails, N being the number of bytes that get copied.  */
28 cmpl $0, (%rcx)
29 jz L(Exit4)
30 cmpl $0, 4(%rcx)
31 jz L(Exit8)
32 cmpl $0, 8(%rcx)
33 jz L(Exit12)
34 cmpl $0, 12(%rcx)
35 jz L(Exit16)
36
 /* rsi = first 16-byte boundary above rcx.  */
37 lea 16(%rcx), %rsi
38 and $-16, %rsi
39
 /* xmm0 = 0.  The first 16 source bytes, which were just found to hold
    no zero element, are copied through r9 while the aligned block at
    rsi is compared against zero element by element.  */
40 pxor %xmm0, %xmm0
41 mov (%rcx), %r9
42 mov %r9, (%rdx)
43
44 pcmpeqd (%rsi), %xmm0
45 mov 8(%rcx), %r9
46 mov %r9, 8(%rdx)
47
 /* rax = byte mask of the comparison, rsi = offset of the tested block
    from rcx.  */
48 pmovmskb %xmm0, %rax
49 sub %rcx, %rsi
50
 /* Zero element inside the tested block: finish with the tail copy at
    offset rsi.  */
51 test %rax, %rax
52 jnz L(CopyFrom1To16Bytes)
53
 /* Round rdx up to the next 16-byte boundary and advance rcx by the
    same number of bytes.  rax then becomes rcx modulo 16.  */
54 mov %rdx, %rax
55 lea 16(%rdx), %rdx
56 and $-16, %rdx
57 sub %rdx, %rax
58 sub %rax, %rcx
59 mov %rcx, %rax
60 and $0xf, %rax
 /* mov leaves the flags of the preceding and untouched, so the jz
    below still tests rax == 0.  Replacing this mov with xor would
    break that.  */
61 mov $0, %rsi
62
63/* case: rcx_offset == rdx_offset */
64
65 jz L(Align16Both)
66
 /* Otherwise select the routine for the source offset: 4, 8, and
    anything else goes to Shl12.  */
67 cmp $4, %rax
68 je L(Shl4)
69 cmp $8, %rax
70 je L(Shl8)
71 jmp L(Shl12)
72
/* Align16Both: both cursors are 16-byte aligned and rsi is 0 on entry.
   Six 16-byte blocks are handled one at a time.  Each block is loaded
   and tested for a zero element, and it is stored only after the
   test came back empty (the store is interleaved with the load of the
   following block).  When a test fires, rsi is the offset of the block
   holding the zero element and CopyFrom1To16Bytes finishes the copy.

   The block at (%rcx) is stored without being tested in this routine.
   NOTE(review): presumably it is the aligned block that was already
   tested at function entry - confirm.  */
73L(Align16Both):
74 movaps (%rcx), %xmm1
75 movaps 16(%rcx), %xmm2
76 movaps %xmm1, (%rdx)
77 pcmpeqd %xmm2, %xmm0
78 pmovmskb %xmm0, %rax
79 lea 16(%rsi), %rsi
80
81 test %rax, %rax
82 jnz L(CopyFrom1To16Bytes)
83
 /* From here on every step loads the block at rcx + rsi + 16, stores
    the previously tested block at rdx + rsi, tests the new block and
    moves rsi on by 16.  */
84 movaps 16(%rcx, %rsi), %xmm3
85 movaps %xmm2, (%rdx, %rsi)
86 pcmpeqd %xmm3, %xmm0
87 pmovmskb %xmm0, %rax
88 lea 16(%rsi), %rsi
89
90 test %rax, %rax
91 jnz L(CopyFrom1To16Bytes)
92
93 movaps 16(%rcx, %rsi), %xmm4
94 movaps %xmm3, (%rdx, %rsi)
95 pcmpeqd %xmm4, %xmm0
96 pmovmskb %xmm0, %rax
97 lea 16(%rsi), %rsi
98
99 test %rax, %rax
100 jnz L(CopyFrom1To16Bytes)
101
102 movaps 16(%rcx, %rsi), %xmm1
103 movaps %xmm4, (%rdx, %rsi)
104 pcmpeqd %xmm1, %xmm0
105 pmovmskb %xmm0, %rax
106 lea 16(%rsi), %rsi
107
108 test %rax, %rax
109 jnz L(CopyFrom1To16Bytes)
110
111 movaps 16(%rcx, %rsi), %xmm2
112 movaps %xmm1, (%rdx, %rsi)
113 pcmpeqd %xmm2, %xmm0
114 pmovmskb %xmm0, %rax
115 lea 16(%rsi), %rsi
116
117 test %rax, %rax
118 jnz L(CopyFrom1To16Bytes)
119
120 movaps 16(%rcx, %rsi), %xmm3
121 movaps %xmm2, (%rdx, %rsi)
122 pcmpeqd %xmm3, %xmm0
123 pmovmskb %xmm0, %rax
124 lea 16(%rsi), %rsi
125
126 test %rax, %rax
127 jnz L(CopyFrom1To16Bytes)
128
 /* No zero element so far.  Store the last tested block, then round
    the address just past it down to a 64-byte boundary for rcx and
    move rdx by the same distance.  Bytes between the new rcx and the
    old position are copied a second time by the loop that follows.  */
129 movaps %xmm3, (%rdx, %rsi)
130 mov %rcx, %rax
131 lea 16(%rcx, %rsi), %rcx
132 and $-0x40, %rcx
133 sub %rcx, %rax
134 sub %rax, %rdx
135
 /* Offset of the first block of a 64-byte group relative to the
    cursors as Aligned64Leave sees them (already advanced by 64).  */
136 mov $-0x40, %rsi
137
/* Aligned64Loop: 64 bytes per iteration using aligned loads from rcx
   and aligned stores to rdx.  xmm4, xmm5, xmm6 and xmm7 end up holding
   the four 16-byte blocks unchanged, xmm2 and xmm3 are scratch for the
   combined test: the byte-wise minimum of all four blocks is compared
   per 4-byte element against xmm0, which is all-zero here.  Both
   cursors are advanced before the branch, which is why the stores use
   negative offsets.  */
138 .p2align 4
139L(Aligned64Loop):
140 movaps (%rcx), %xmm2
141 movaps %xmm2, %xmm4
142 movaps 16(%rcx), %xmm5
143 movaps 32(%rcx), %xmm3
144 movaps %xmm3, %xmm6
145 movaps 48(%rcx), %xmm7
146 pminub %xmm5, %xmm2
147 pminub %xmm7, %xmm3
148 pminub %xmm2, %xmm3
 /* The result goes to xmm3, so xmm0 keeps its zero value.  */
149 pcmpeqd %xmm0, %xmm3
150 pmovmskb %xmm3, %rax
 /* lea does not modify flags, and the test comes after it anyway.  */
151 lea 64(%rdx), %rdx
152 lea 64(%rcx), %rcx
153 test %rax, %rax
 /* Some element of the combined minimum is zero: inspect the four
    blocks one by one.  */
154 jnz L(Aligned64Leave)
155 movaps %xmm4, -64(%rdx)
156 movaps %xmm5, -48(%rdx)
157 movaps %xmm6, -32(%rdx)
158 movaps %xmm7, -16(%rdx)
159 jmp L(Aligned64Loop)
160
/* Aligned64Leave: the combined test of Aligned64Loop fired.  Both
   cursors already point 64 bytes past the group and rsi is -0x40 on
   entry, so rsi is always the offset of the block under test.  xmm4,
   xmm5, xmm6 and xmm7 are tested in turn.  A block without a zero
   element is stored, the first block with one is finished by
   CopyFrom1To16Bytes.  rsi is stepped with lea between test and jnz
   because lea does not modify the flags.  */
161L(Aligned64Leave):
162 pcmpeqd %xmm4, %xmm0
163 pmovmskb %xmm0, %rax
164 test %rax, %rax
165 jnz L(CopyFrom1To16Bytes)
166
167 pcmpeqd %xmm5, %xmm0
168
169 pmovmskb %xmm0, %rax
170 movaps %xmm4, -64(%rdx)
171 test %rax, %rax
172 lea 16(%rsi), %rsi
173 jnz L(CopyFrom1To16Bytes)
174
175 pcmpeqd %xmm6, %xmm0
176
177 pmovmskb %xmm0, %rax
178 movaps %xmm5, -48(%rdx)
179 test %rax, %rax
180 lea 16(%rsi), %rsi
181 jnz L(CopyFrom1To16Bytes)
182
183 movaps %xmm6, -32(%rdx)
184 pcmpeqd %xmm7, %xmm0
185
186 pmovmskb %xmm0, %rax
187 lea 16(%rsi), %rsi
188 test %rax, %rax
189 jnz L(CopyFrom1To16Bytes)
190
 /* None of the four blocks holds a zero element.  pminub takes a
    byte-wise minimum, so the combined value can contain a zero
    element although no single block does.  Store the last block,
    reset rsi and resume the loop.  */
191 mov $-0x40, %rsi
192 movaps %xmm7, -16(%rdx)
193 jmp L(Aligned64Loop)
194
/* Shl4: rdx is 16-byte aligned and rcx is 4 bytes past a 16-byte
   boundary.  Only aligned loads are used: xmm1 is the aligned block
   starting 4 bytes before rcx, xmm2 the aligned block after it.
   palignr $4 joins the pair and yields the 16 source bytes starting
   at rcx, which go to the aligned destination.  Every newly loaded
   block is tested for a zero element before it is used.  */
195 .p2align 4
196L(Shl4):
197 movaps -4(%rcx), %xmm1
198 movaps 12(%rcx), %xmm2
 /* Shl4Start is also the target of the 64-byte loop below.  It expects
    xmm1 = previous aligned block, xmm2 = current aligned block and
    xmm0 = 0.  */
199L(Shl4Start):
200 pcmpeqd %xmm2, %xmm0
201 pmovmskb %xmm0, %rax
 /* Keep an unshifted copy, palignr overwrites xmm2.  xmm1 and xmm3
    take turns as the previous block in the steps below.  */
202 movaps %xmm2, %xmm3
203
204 test %rax, %rax
205 jnz L(Shl4LoopExit)
206
207 palignr $4, %xmm1, %xmm2
208 movaps %xmm2, (%rdx)
209 movaps 28(%rcx), %xmm2
210
211 pcmpeqd %xmm2, %xmm0
212 lea 16(%rdx), %rdx
213 pmovmskb %xmm0, %rax
214 lea 16(%rcx), %rcx
215 movaps %xmm2, %xmm1
216
217 test %rax, %rax
218 jnz L(Shl4LoopExit)
219
220 palignr $4, %xmm3, %xmm2
221 movaps %xmm2, (%rdx)
222 movaps 28(%rcx), %xmm2
223
224 pcmpeqd %xmm2, %xmm0
225 lea 16(%rdx), %rdx
226 pmovmskb %xmm0, %rax
227 lea 16(%rcx), %rcx
228 movaps %xmm2, %xmm3
229
230 test %rax, %rax
231 jnz L(Shl4LoopExit)
232
233 palignr $4, %xmm1, %xmm2
234 movaps %xmm2, (%rdx)
235 movaps 28(%rcx), %xmm2
236
237 pcmpeqd %xmm2, %xmm0
238 lea 16(%rdx), %rdx
239 pmovmskb %xmm0, %rax
240 lea 16(%rcx), %rcx
241
242 test %rax, %rax
243 jnz L(Shl4LoopExit)
244
245 palignr $4, %xmm3, %xmm2
246 movaps %xmm2, (%rdx)
 /* Realign for the 64-byte loop: rcx + 28 is the end of the last
    aligned block loaded.  It is rounded down to a 64-byte boundary
    and both cursors are stepped back accordingly, so that rcx + 12
    lies on that boundary.  Bytes already copied may be copied again.  */
247 lea 28(%rcx), %rcx
248 lea 16(%rdx), %rdx
249
250 mov %rcx, %rax
251 and $-0x40, %rcx
252 sub %rcx, %rax
253 lea -12(%rcx), %rcx
254 sub %rax, %rdx
255
 /* Previous aligned block for the first palignr of the loop.  */
256 movaps -4(%rcx), %xmm1
257
 /* 64 source bytes per iteration.  xmm2, xmm3, xmm4 and xmm5 are four
    consecutive aligned blocks.  xmm6 and xmm7 are scratch for the
    combined test (byte-wise minimum compared per 4-byte element with
    xmm0), afterwards xmm7 keeps an unshifted copy of the last block,
    which becomes xmm1 of the next iteration.  palignr does not touch
    the flags, so it may sit between test and jnz.  If the test fires,
    Shl4Start redoes the group block by block from xmm1 and xmm2 with
    the cursors unchanged.  */
258 .p2align 4
259L(Shl4LoopStart):
260 movaps 12(%rcx), %xmm2
261 movaps 28(%rcx), %xmm3
262 movaps %xmm3, %xmm6
263 movaps 44(%rcx), %xmm4
264 movaps %xmm4, %xmm7
265 movaps 60(%rcx), %xmm5
266 pminub %xmm2, %xmm6
267 pminub %xmm5, %xmm7
268 pminub %xmm6, %xmm7
269 pcmpeqd %xmm0, %xmm7
270 pmovmskb %xmm7, %rax
271 movaps %xmm5, %xmm7
272 palignr $4, %xmm4, %xmm5
273 test %rax, %rax
274 palignr $4, %xmm3, %xmm4
275 jnz L(Shl4Start)
276
277 palignr $4, %xmm2, %xmm3
278 lea 64(%rcx), %rcx
279 palignr $4, %xmm1, %xmm2
280 movaps %xmm7, %xmm1
281 movaps %xmm5, 48(%rdx)
282 movaps %xmm4, 32(%rdx)
283 movaps %xmm3, 16(%rdx)
284 movaps %xmm2, (%rdx)
285 lea 64(%rdx), %rdx
286 jmp L(Shl4LoopStart)
287
 /* The zero element is in the aligned block that starts at rcx + 12
    and rax holds its mask.  Copy the 16 bytes in front of that block
    with unaligned moves, then let CopyFrom1To16Bytes finish at
    offset 12.  */
288L(Shl4LoopExit):
289 movdqu -4(%rcx), %xmm1
290 mov $12, %rsi
291 movdqu %xmm1, -4(%rdx)
292 jmp L(CopyFrom1To16Bytes)
293
/* Shl8: rdx is 16-byte aligned and rcx is 8 bytes past a 16-byte
   boundary.  Only aligned loads are used: xmm1 is the aligned block
   starting 8 bytes before rcx, xmm2 the aligned block after it.
   palignr $8 joins the pair and yields the 16 source bytes starting
   at rcx, which go to the aligned destination.  Every newly loaded
   block is tested for a zero element before it is used.  */
294 .p2align 4
295L(Shl8):
296 movaps -8(%rcx), %xmm1
297 movaps 8(%rcx), %xmm2
 /* Shl8Start is also the target of the 64-byte loop below.  It expects
    xmm1 = previous aligned block, xmm2 = current aligned block and
    xmm0 = 0.  */
298L(Shl8Start):
299 pcmpeqd %xmm2, %xmm0
300 pmovmskb %xmm0, %rax
 /* Keep an unshifted copy, palignr overwrites xmm2.  xmm1 and xmm3
    take turns as the previous block in the steps below.  */
301 movaps %xmm2, %xmm3
302
303 test %rax, %rax
304 jnz L(Shl8LoopExit)
305
306 palignr $8, %xmm1, %xmm2
307 movaps %xmm2, (%rdx)
308 movaps 24(%rcx), %xmm2
309
310 pcmpeqd %xmm2, %xmm0
311 lea 16(%rdx), %rdx
312 pmovmskb %xmm0, %rax
313 lea 16(%rcx), %rcx
314 movaps %xmm2, %xmm1
315
316 test %rax, %rax
317 jnz L(Shl8LoopExit)
318
319 palignr $8, %xmm3, %xmm2
320 movaps %xmm2, (%rdx)
321 movaps 24(%rcx), %xmm2
322
323 pcmpeqd %xmm2, %xmm0
324 lea 16(%rdx), %rdx
325 pmovmskb %xmm0, %rax
326 lea 16(%rcx), %rcx
327 movaps %xmm2, %xmm3
328
329 test %rax, %rax
330 jnz L(Shl8LoopExit)
331
332 palignr $8, %xmm1, %xmm2
333 movaps %xmm2, (%rdx)
334 movaps 24(%rcx), %xmm2
335
336 pcmpeqd %xmm2, %xmm0
337 lea 16(%rdx), %rdx
338 pmovmskb %xmm0, %rax
339 lea 16(%rcx), %rcx
340
341 test %rax, %rax
342 jnz L(Shl8LoopExit)
343
344 palignr $8, %xmm3, %xmm2
345 movaps %xmm2, (%rdx)
 /* Realign for the 64-byte loop: rcx + 24 is the end of the last
    aligned block loaded.  It is rounded down to a 64-byte boundary
    and both cursors are stepped back accordingly, so that rcx + 8
    lies on that boundary.  Bytes already copied may be copied again.  */
346 lea 24(%rcx), %rcx
347 lea 16(%rdx), %rdx
348
349 mov %rcx, %rax
350 and $-0x40, %rcx
351 sub %rcx, %rax
352 lea -8(%rcx), %rcx
353 sub %rax, %rdx
354
 /* Previous aligned block for the first palignr of the loop.  */
355 movaps -8(%rcx), %xmm1
356
 /* 64 source bytes per iteration.  xmm2, xmm3, xmm4 and xmm5 are four
    consecutive aligned blocks.  xmm6 and xmm7 are scratch for the
    combined test (byte-wise minimum compared per 4-byte element with
    xmm0), afterwards xmm7 keeps an unshifted copy of the last block,
    which becomes xmm1 of the next iteration.  palignr does not touch
    the flags, so it may sit between test and jnz.  If the test fires,
    Shl8Start redoes the group block by block from xmm1 and xmm2 with
    the cursors unchanged.  */
357 .p2align 4
358L(Shl8LoopStart):
359 movaps 8(%rcx), %xmm2
360 movaps 24(%rcx), %xmm3
361 movaps %xmm3, %xmm6
362 movaps 40(%rcx), %xmm4
363 movaps %xmm4, %xmm7
364 movaps 56(%rcx), %xmm5
365 pminub %xmm2, %xmm6
366 pminub %xmm5, %xmm7
367 pminub %xmm6, %xmm7
368 pcmpeqd %xmm0, %xmm7
369 pmovmskb %xmm7, %rax
370 movaps %xmm5, %xmm7
371 palignr $8, %xmm4, %xmm5
372 test %rax, %rax
373 palignr $8, %xmm3, %xmm4
374 jnz L(Shl8Start)
375
376 palignr $8, %xmm2, %xmm3
377 lea 64(%rcx), %rcx
378 palignr $8, %xmm1, %xmm2
379 movaps %xmm7, %xmm1
380 movaps %xmm5, 48(%rdx)
381 movaps %xmm4, 32(%rdx)
382 movaps %xmm3, 16(%rdx)
383 movaps %xmm2, (%rdx)
384 lea 64(%rdx), %rdx
385 jmp L(Shl8LoopStart)
386
 /* The zero element is in the aligned block that starts at rcx + 8 and
    rax holds its mask.  Copy the 8 bytes in front of that block
    through r9, then let CopyFrom1To16Bytes finish at offset 8.  */
387L(Shl8LoopExit):
388 mov (%rcx), %r9
389 mov $8, %rsi
390 mov %r9, (%rdx)
391 jmp L(CopyFrom1To16Bytes)
392
/* Shl12: rdx is 16-byte aligned.  This routine is reached for every
   source offset other than 0, 4 and 8 and treats rcx as lying 12
   bytes past a 16-byte boundary.  Only aligned loads are used: xmm1
   is the aligned block starting 12 bytes before rcx, xmm2 the aligned
   block after it.  palignr $12 joins the pair and yields the 16
   source bytes starting at rcx, which go to the aligned destination.
   Every newly loaded block is tested for a zero element before it is
   used.  */
393 .p2align 4
394L(Shl12):
395 movaps -12(%rcx), %xmm1
396 movaps 4(%rcx), %xmm2
 /* Shl12Start is also the target of the 64-byte loop below.  It
    expects xmm1 = previous aligned block, xmm2 = current aligned
    block and xmm0 = 0.  */
397L(Shl12Start):
398 pcmpeqd %xmm2, %xmm0
399 pmovmskb %xmm0, %rax
 /* Keep an unshifted copy, palignr overwrites xmm2.  xmm1 and xmm3
    take turns as the previous block in the steps below.  */
400 movaps %xmm2, %xmm3
401
402 test %rax, %rax
403 jnz L(Shl12LoopExit)
404
405 palignr $12, %xmm1, %xmm2
406 movaps %xmm2, (%rdx)
407 movaps 20(%rcx), %xmm2
408
409 pcmpeqd %xmm2, %xmm0
410 lea 16(%rdx), %rdx
411 pmovmskb %xmm0, %rax
412 lea 16(%rcx), %rcx
413 movaps %xmm2, %xmm1
414
415 test %rax, %rax
416 jnz L(Shl12LoopExit)
417
418 palignr $12, %xmm3, %xmm2
419 movaps %xmm2, (%rdx)
420 movaps 20(%rcx), %xmm2
421
422 pcmpeqd %xmm2, %xmm0
423 lea 16(%rdx), %rdx
424 pmovmskb %xmm0, %rax
425 lea 16(%rcx), %rcx
426 movaps %xmm2, %xmm3
427
428 test %rax, %rax
429 jnz L(Shl12LoopExit)
430
431 palignr $12, %xmm1, %xmm2
432 movaps %xmm2, (%rdx)
433 movaps 20(%rcx), %xmm2
434
435 pcmpeqd %xmm2, %xmm0
436 lea 16(%rdx), %rdx
437 pmovmskb %xmm0, %rax
438 lea 16(%rcx), %rcx
439
440 test %rax, %rax
441 jnz L(Shl12LoopExit)
442
443 palignr $12, %xmm3, %xmm2
444 movaps %xmm2, (%rdx)
 /* Realign for the 64-byte loop: rcx + 20 is the end of the last
    aligned block loaded.  It is rounded down to a 64-byte boundary
    and both cursors are stepped back accordingly, so that rcx + 4
    lies on that boundary.  Bytes already copied may be copied again.  */
445 lea 20(%rcx), %rcx
446 lea 16(%rdx), %rdx
447
448 mov %rcx, %rax
449 and $-0x40, %rcx
450 sub %rcx, %rax
451 lea -4(%rcx), %rcx
452 sub %rax, %rdx
453
 /* Previous aligned block for the first palignr of the loop.  */
454 movaps -12(%rcx), %xmm1
455
 /* 64 source bytes per iteration.  xmm2, xmm3, xmm4 and xmm5 are four
    consecutive aligned blocks.  xmm6 and xmm7 are scratch for the
    combined test (byte-wise minimum compared per 4-byte element with
    xmm0), afterwards xmm7 keeps an unshifted copy of the last block,
    which becomes xmm1 of the next iteration.  palignr does not touch
    the flags, so it may sit between test and jnz.  If the test fires,
    Shl12Start redoes the group block by block from xmm1 and xmm2 with
    the cursors unchanged.  */
456 .p2align 4
457L(Shl12LoopStart):
458 movaps 4(%rcx), %xmm2
459 movaps 20(%rcx), %xmm3
460 movaps %xmm3, %xmm6
461 movaps 36(%rcx), %xmm4
462 movaps %xmm4, %xmm7
463 movaps 52(%rcx), %xmm5
464 pminub %xmm2, %xmm6
465 pminub %xmm5, %xmm7
466 pminub %xmm6, %xmm7
467 pcmpeqd %xmm0, %xmm7
468 pmovmskb %xmm7, %rax
469 movaps %xmm5, %xmm7
470 palignr $12, %xmm4, %xmm5
471 test %rax, %rax
472 palignr $12, %xmm3, %xmm4
473 jnz L(Shl12Start)
474 palignr $12, %xmm2, %xmm3
475 lea 64(%rcx), %rcx
476 palignr $12, %xmm1, %xmm2
477 movaps %xmm7, %xmm1
478 movaps %xmm5, 48(%rdx)
479 movaps %xmm4, 32(%rdx)
480 movaps %xmm3, 16(%rdx)
481 movaps %xmm2, (%rdx)
482 lea 64(%rdx), %rdx
483 jmp L(Shl12LoopStart)
484
 /* The zero element is in the aligned block that starts at rcx + 4 and
    rax holds its mask.  Copy the single element in front of that block
    through r9d, then let CopyFrom1To16Bytes finish at offset 4.  */
485L(Shl12LoopExit):
486 mov (%rcx), %r9d
487 mov $4, %rsi
488 mov %r9d, (%rdx)
489 jmp L(CopyFrom1To16Bytes)
490
/* CopyFrom1To16Bytes: common tail.
   In:  rax = pmovmskb mask of the 16-byte block that holds the first
              zero element (pcmpeqd sets four mask bits per element),
        rsi = byte offset of that block from both cursors,
        rcx, rdx = read and write cursors, rdi = original destination.
   Copies the elements of that block up to and including the zero
   element and returns with rax = rdi.  Only mask bit 0 (element 0) and
   mask bit 8 (element 2) are tested explicitly, al == 0 separates
   elements 0-1 from elements 2-3.  */
491 .p2align 4
492L(CopyFrom1To16Bytes):
493 add %rsi, %rdx
494 add %rsi, %rcx
495
 /* al covers the mask bits of elements 0 and 1.  */
496 test %al, %al
497 jz L(ExitHigh)
498 test $0x01, %al
499 jnz L(Exit4)
500
 /* Element 1 is the zero element: copy 8 bytes.  rax doubles as the
    transfer register before it receives the return value.  */
501 mov (%rcx), %rax
502 mov %rax, (%rdx)
503 mov %rdi, %rax
504 ret
505
 /* ExitHigh: the zero element is element 2 or 3.  Bit 0 of ah is mask
    bit 8, the first byte of element 2.  */
506 .p2align 4
507L(ExitHigh):
508 test $0x01, %ah
509 jnz L(Exit12)
510
 /* Element 3 is the zero element: copy the whole 16-byte block.  */
511 mov (%rcx), %rax
512 mov %rax, (%rdx)
513 mov 8(%rcx), %rax
514 mov %rax, 8(%rdx)
515 mov %rdi, %rax
516 ret
517
/* Exit4, Exit8, Exit12, Exit16: copy 4, 8, 12 or 16 bytes from (%rcx)
   to (%rdx) and return the original destination (%rdi) in %rax.  rax
   or eax doubles as the transfer register before it receives the
   return value.  All four are reached from the function entry for
   strings whose zero element is among the first four elements.  Exit4
   and Exit12 are additionally reached from CopyFrom1To16Bytes and
   ExitHigh.  */
518 .p2align 4
519L(Exit4):
520 movl (%rcx), %eax
521 movl %eax, (%rdx)
522 mov %rdi, %rax
523 ret
524
525 .p2align 4
526L(Exit8):
527 mov (%rcx), %rax
528 mov %rax, (%rdx)
529 mov %rdi, %rax
530 ret
531
532 .p2align 4
533L(Exit12):
534 mov (%rcx), %rax
535 mov %rax, (%rdx)
 /* Third element: a 4-byte move through eax.  */
536 mov 8(%rcx), %eax
537 mov %eax, 8(%rdx)
538 mov %rdi, %rax
539 ret
540
541 .p2align 4
542L(Exit16):
543 mov (%rcx), %rax
544 mov %rax, (%rdx)
545 mov 8(%rcx), %rax
546 mov %rax, 8(%rdx)
547 mov %rdi, %rax
548 ret
549
550END(__wcscpy_ssse3)
551#endif
552

source code of glibc/sysdeps/x86_64/multiarch/wcscpy-ssse3.S