/* wcscpy with SSSE3
   Copyright (C) 2011-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)
# include <sysdep.h>

/* Unwind bookkeeping for a 4-byte push of REG: the CFA offset grows by
   4 and REG is recorded as saved at relative offset 0.  The cfi_*
   helpers are not defined in this file (presumably they come from
   sysdep.h and wrap the assembler .cfi_* directives).  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Inverse of CFI_PUSH: the CFA offset shrinks by 4 and REG is marked
   as restored.  */
# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* pushl/popl of REG together with the matching unwind annotation.  */
# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* Offset from %esp, at function entry, of the first stack argument.  */
# define PARMS	4
/* Epilogue for every path that saved %edi: restore it and return.  The
   trailing CFI_PUSH emits no instruction; it puts the unwind state back
   to "%edi is saved" for the code that is assembled after the ret.  */
# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
/* Argument slots.  STR1 is loaded as the destination and STR2 as the
   source below; LEN is defined but not referenced in this file.  */
# define STR1	PARMS
# define STR2	STR1+4
# define LEN	STR2+4

/* __wcscpy_ssse3 (dest, src)

   Copies 4-byte elements from src to dest up to and including the
   first element that is zero, and returns dest in %eax.

   In:   STR1(%esp) = dest, STR2(%esp) = src.
   Out:  %eax = dest.
   Uses: %eax, %ecx, %edx, %xmm0-%xmm7, flags; %edi and %esi are pushed
	 and popped again on every path that touches them.

   Register roles on the long path:
     %ecx   source cursor
     %edx   destination cursor (16-byte aligned once the setup is done)
     %edi   original dest, kept for the return value
     %esi   byte offset added to both cursors at exit, or scratch
     %eax   pmovmskb mask of the 16-byte block that holds the terminator
     %xmm0  all-zero comparand; it is overwritten by each pcmpeqd but is
	    still all-zero whenever that compare found no zero element

   Mask decoding used by all exits: pcmpeqd sets all four bytes of a
   matching dword and pmovmskb yields one bit per byte, so dword i owns
   bits 4*i .. 4*i+3.  %al therefore covers dwords 0-1 and %ah covers
   dwords 2-3, and testing bit 0 of either half tells the lower dword
   from the upper one.

   Phases:
     1. Check the first four elements one by one (ExitTail* exits, no
	registers saved).
     2. Copy the first 16 bytes unaligned, then advance both cursors so
	that dest is 16-byte aligned.
     3. Dispatch on the remaining source misalignment: 0 -> Align16Both,
	4 -> Shl4, 8 -> Shl8, otherwise -> Shl12.  The Shl paths rebuild
	each aligned 16-byte store from two aligned loads with palignr.
     4. Each path runs a short unrolled prologue and then a 64-byte loop.

   NOTE(review): the aligned loads in the Shl paths rely on src and dest
   being 4-byte aligned, which this file does not check -- confirm that
   callers guarantee it.  */

	atom_text_section
ENTRY (__wcscpy_ssse3)
	mov	STR1(%esp), %edx	/* edx = dest */
	mov	STR2(%esp), %ecx	/* ecx = src */

	/* Terminator within the first four elements: copy 4/8/12/16
	   bytes and return without saving any register.  */
	cmpl	$0, (%ecx)
	jz	L(ExitTail4)
	cmpl	$0, 4(%ecx)
	jz	L(ExitTail8)
	cmpl	$0, 8(%ecx)
	jz	L(ExitTail12)
	cmpl	$0, 12(%ecx)
	jz	L(ExitTail16)

	PUSH	(%edi)
	mov	%edx, %edi		/* edi = dest, the return value */
	PUSH	(%esi)
	lea	16(%ecx), %esi

	and	$-16, %esi		/* first 16-byte boundary above src */

	/* Look for a zero element in that aligned block while copying
	   the first 16 (already checked) bytes unaligned.  */
	pxor	%xmm0, %xmm0
	pcmpeqd	(%esi), %xmm0
	movdqu	(%ecx), %xmm1
	movdqu	%xmm1, (%edx)

	pmovmskb %xmm0, %eax
	sub	%ecx, %esi		/* esi = offset of that block from src */

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* Round dest up to the next 16-byte boundary and move src
	   forward by the same number of bytes.  */
	mov	%edx, %eax
	lea	16(%edx), %edx
	and	$-16, %edx
	sub	%edx, %eax		/* eax = old dest - aligned dest */

	sub	%eax, %ecx
	mov	%ecx, %eax
	and	$0xf, %eax		/* source misalignment; sets ZF */
	mov	$0, %esi		/* mov, not xor: ZF must survive */

	jz	L(Align16Both)
	cmp	$4, %eax
	je	L(Shl4)
	cmp	$8, %eax
	je	L(Shl8)
	jmp	L(Shl12)

/* Source and destination are both 16-byte aligned.  Unrolled prologue:
   every step loads the next block, stores the block checked by the
   previous step, tests the new block and adds 16 to esi, so that on
   exit ecx + esi and edx + esi address the block with the terminator.
   The first block is stored without a test of its own.  */
L(Align16Both):
	movaps	(%ecx), %xmm1
	movaps	16(%ecx), %xmm2
	movaps	%xmm1, (%edx)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm4
	movaps	%xmm3, (%edx, %esi)
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm1
	movaps	%xmm4, (%edx, %esi)
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm2
	movaps	%xmm1, (%edx, %esi)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* Store the last checked block, then round the source cursor
	   down to a 64-byte boundary and move dest by the same amount.
	   Because this rounds down, the loop may copy again some blocks
	   that were already written.  */
	movaps	%xmm3, (%edx, %esi)
	mov	%ecx, %eax
	lea	16(%ecx, %esi), %ecx
	and	$-0x40, %ecx
	sub	%ecx, %eax
	sub	%eax, %edx

	mov	$-0x40, %esi		/* cursors run 64 ahead in the loop */

/* 64 bytes per iteration.  The four blocks are folded with a byte-wise
   minimum and compared with zero once.  A byte-wise minimum can yield a
   zero dword although no single block contains one, so a hit is only a
   candidate; Aligned64Leave checks the blocks one at a time.  */
L(Aligned64Loop):
	movaps	(%ecx), %xmm2
	movaps	32(%ecx), %xmm3
	movaps	%xmm2, %xmm4		/* xmm4 = block 0 */
	movaps	16(%ecx), %xmm5		/* xmm5 = block 1 */
	movaps	%xmm3, %xmm6		/* xmm6 = block 2 */
	movaps	48(%ecx), %xmm7		/* xmm7 = block 3 */
	pminub	%xmm5, %xmm2
	pminub	%xmm7, %xmm3
	pminub	%xmm2, %xmm3
	lea	64(%edx), %edx
	pcmpeqd	%xmm0, %xmm3
	lea	64(%ecx), %ecx
	pmovmskb %xmm3, %eax

	test	%eax, %eax
	jnz	L(Aligned64Leave)
	movaps	%xmm4, -64(%edx)
	movaps	%xmm5, -48(%edx)
	movaps	%xmm6, -32(%edx)
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

/* Check blocks 0-3 individually.  esi starts at -64 and grows by 16 per
   block, so that ecx + esi and edx + esi address the block under test.
   Each lea sits between test and jnz on purpose: lea leaves the flags
   alone.  */
L(Aligned64Leave):
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm5, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm4, -64(%edx)
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm6, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm5, -48(%edx)
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm6, -32(%edx)
	pcmpeqd	%xmm7, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	lea	16(%esi), %esi
	jnz	L(CopyFrom1To16Bytes)

	/* No block holds a zero element after all: finish the stores,
	   reset the offset and carry on with the loop.  */
	mov	$-0x40, %esi
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

/* Source is 4 bytes past a 16-byte boundary, dest is aligned.  xmm1 and
   xmm3 take turns holding the previous aligned source block; the next
   aligned block is loaded into xmm2, tested, and palignr $4 combines
   the two into the 16 bytes that start at the source cursor.  */
	.p2align 4
L(Shl4):
	movaps	-4(%ecx), %xmm1
	movaps	12(%ecx), %xmm2
L(Shl4Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	28(%ecx), %ecx
	lea	16(%edx), %edx

	/* Round the aligned load address down to a 64-byte boundary,
	   move dest back by the same amount, and put the 4-byte skew
	   back into ecx (-12 is congruent to 4 modulo 16).  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-12(%ecx), %ecx
	sub	%eax, %edx

	movaps	-4(%ecx), %xmm1		/* previous block for the loop */

/* 64 bytes per iteration.  On a candidate hit control goes back to
   Shl4Start with xmm1 and xmm2 still unmodified, which then handles the
   data one block at a time; the palignr instructions placed around the
   test do not change the flags.  */
L(Shl4LoopStart):
	movaps	12(%ecx), %xmm2
	movaps	28(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	44(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	60(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7		/* becomes xmm1 of the next round */
	palignr	$4, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$4, %xmm3, %xmm4
	jnz	L(Shl4Start)

	palignr	$4, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl4LoopStart)

/* The block at 12(%ecx) holds the terminator.  Copy the 12 bytes in
   front of it (esi is free to serve as scratch because it is popped
   right after), advance both cursors to that block and decode the mask
   in eax.  The Exit8 case is handled inline.  */
L(Shl4LoopExit):
	movlpd	(%ecx), %xmm0
	movl	8(%ecx), %esi
	movlpd	%xmm0, (%edx)
	movl	%esi, 8(%edx)
	POP	(%esi)
	add	$12, %edx
	add	$12, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	/* Annotation only: the code below is entered with esi still
	   pushed.  */
	CFI_PUSH	(%esi)

/* Same scheme as Shl4 for a source that is 8 bytes past a 16-byte
   boundary.  */
	.p2align 4
L(Shl8):
	movaps	-8(%ecx), %xmm1
	movaps	8(%ecx), %xmm2
L(Shl8Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	24(%ecx), %ecx
	lea	16(%edx), %edx

	/* Round down to a 64-byte boundary and restore the 8-byte
	   skew.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-8(%ecx), %ecx
	sub	%eax, %edx

	movaps	-8(%ecx), %xmm1		/* previous block for the loop */

/* 64 bytes per iteration; a candidate hit re-enters Shl8Start.  */
L(Shl8LoopStart):
	movaps	8(%ecx), %xmm2
	movaps	24(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	40(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	56(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7		/* becomes xmm1 of the next round */
	palignr	$8, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$8, %xmm3, %xmm4
	jnz	L(Shl8Start)

	palignr	$8, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl8LoopStart)

/* The block at 8(%ecx) holds the terminator.  Copy the 8 bytes in front
   of it, advance both cursors to that block and decode the mask.  */
L(Shl8LoopExit):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	POP	(%esi)
	add	$8, %edx
	add	$8, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	/* Annotation only: the code below is entered with esi still
	   pushed.  */
	CFI_PUSH	(%esi)

/* Same scheme as Shl4 for the remaining case, handled as a source that
   is 12 bytes past a 16-byte boundary.  */
	.p2align 4
L(Shl12):
	movaps	-12(%ecx), %xmm1
	movaps	4(%ecx), %xmm2
L(Shl12Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	20(%ecx), %ecx
	lea	16(%edx), %edx

	/* Round down to a 64-byte boundary and restore the 12-byte
	   skew (-4 is congruent to 12 modulo 16).  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-4(%ecx), %ecx
	sub	%eax, %edx

	movaps	-12(%ecx), %xmm1	/* previous block for the loop */

/* 64 bytes per iteration; a candidate hit re-enters Shl12Start.  */
L(Shl12LoopStart):
	movaps	4(%ecx), %xmm2
	movaps	20(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7		/* becomes xmm1 of the next round */
	palignr	$12, %xmm4, %xmm5
	test	%eax, %eax
	palignr	$12, %xmm3, %xmm4
	jnz	L(Shl12Start)

	palignr	$12, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl12LoopStart)

/* The block at 4(%ecx) holds the terminator.  Copy the 4 bytes in front
   of it, set the offset to 4 and fall through into the common exit.  */
L(Shl12LoopExit):
	movl	(%ecx), %esi
	movl	%esi, (%edx)
	mov	$4, %esi

/* Common exit.  In: eax = mask of the terminating block, esi = offset
   of that block from both cursors, edi = original dest.  Copies the
   block up to and including the zero element and returns dest.  */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)
	test	%al, %al
	jz	L(ExitHigh)		/* zero element is dword 2 or 3 */
	test	$0x01, %al
	jnz	L(Exit4)		/* zero element is dword 0 */
L(Exit8):
	movlpd	(%ecx), %xmm0		/* zero element is dword 1 */
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit12)		/* zero element is dword 2 */
L(Exit16):
	movdqu	(%ecx), %xmm0		/* zero element is dword 3 */
	movdqu	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edi, %eax
	RETURN

/* Annotation only: the ExitTail paths are jumped to from the entry
   checks, before edi has been pushed.  */
CFI_POP	(%edi)

/* Short-string exits.  In: ecx = src, edx = dest, both unmodified.
   Copy 4, 8, 12 or 16 bytes including the zero element and return
   dest.  */
	.p2align 4
L(ExitTail4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail16):
	movdqu	(%ecx), %xmm0
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	ret

END (__wcscpy_ssse3)
#endif
/* Source: glibc/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S  */