1 | /* wcscpy with SSSE3 |
2 | Copyright (C) 2011-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | # include <sysdep.h> |
21 | |
22 | .section .text.ssse3,"ax" ,@progbits |
/* __wcscpy_ssse3: copy a zero-terminated string of 32-bit elements.
   In:   %rdi = destination, %rsi = source.
   Out:  %rax = original %rdi (every exit stub ends with mov %rdi, %rax).
   Working registers used throughout the function:
     %rcx   source cursor
     %rdx   destination cursor
     %rsi   byte offset handed to L(CopyFrom1To16Bytes)
     %rax   pmovmskb mask of the most recent zero test
     %r9    scratch for 4/8-byte moves
     %xmm0  all-zero comparand (it only becomes non-zero on a path that
            is about to leave through L(CopyFrom1To16Bytes))
   NOTE(review): the code assumes both pointers are 4-byte aligned (the
   dispatch at the end of this block only distinguishes offsets 0/4/8 and
   treats everything else as 12) -- confirm against the callers.  */
23 | ENTRY (__wcscpy_ssse3) |
24 | |
/* %rdi is left untouched so it can be returned at the end.  */
25 | mov %rsi, %rcx |
26 | mov %rdi, %rdx |
27 | |
/* Short strings: test the first four elements one at a time.  The first
   zero found selects the exit stub that copies everything up to and
   including it (4, 8, 12 or 16 bytes).  */
28 | cmpl $0, (%rcx) |
29 | jz L(Exit4) |
30 | cmpl $0, 4(%rcx) |
31 | jz L(Exit8) |
32 | cmpl $0, 8(%rcx) |
33 | jz L(Exit12) |
34 | cmpl $0, 12(%rcx) |
35 | jz L(Exit16) |
36 | |
/* %rsi = first 16-byte boundary strictly above the source start.  */
37 | lea 16(%rcx), %rsi |
38 | and $-16, %rsi |
39 | |
/* Zero the comparand, then copy source bytes 0..7 (already known to
   contain no terminator).  */
40 | pxor %xmm0, %xmm0 |
41 | mov (%rcx), %r9 |
42 | mov %r9, (%rdx) |
43 | |
/* Test the aligned chunk at %rsi for a zero element; the copy of source
   bytes 8..15 is interleaved with the test.  */
44 | pcmpeqd (%rsi), %xmm0 |
45 | mov 8(%rcx), %r9 |
46 | mov %r9, 8(%rdx) |
47 | |
/* %rax = byte mask of the test.  %rsi turns into the offset of the tested
   chunk from the source start (1..16).  */
48 | pmovmskb %xmm0, %rax |
49 | sub %rcx, %rsi |
50 | |
/* Terminator inside that chunk: finish with %rsi / %rax as set above.  */
51 | test %rax, %rax |
52 | jnz L(CopyFrom1To16Bytes) |
53 | |
/* No terminator yet, so %xmm0 is still all-zero.  Round the destination up
   to the next 16-byte boundary and advance the source by the same number
   of bytes (at most 16, all of which were copied above).  %rax ends up as
   the source misalignment relative to the now aligned destination.  */
54 | mov %rdx, %rax |
55 | lea 16(%rdx), %rdx |
56 | and $-16, %rdx |
57 | sub %rdx, %rax |
58 | sub %rax, %rcx |
59 | mov %rcx, %rax |
60 | and $0xf, %rax |
/* mov instead of xor: the flags produced by the `and` above must survive
   until the jz below.  */
61 | mov $0, %rsi |
62 | |
63 | /* case: rcx_offset == rdx_offset */ |
64 | |
65 | jz L(Align16Both) |
66 | |
/* Source is 4, 8 or (by elimination) 12 bytes past a 16-byte boundary.  */
67 | cmp $4, %rax |
68 | je L(Shl4) |
69 | cmp $8, %rax |
70 | je L(Shl8) |
71 | jmp L(Shl12) |
72 | |
/* L(Align16Both): source and destination are both 16-byte aligned.
   Entered with %rsi = 0 and %xmm0 = 0.  Six unrolled steps each load the
   next aligned chunk, store the previous one, and test the new one for a
   zero element.  %rsi is advanced by 16 before each test, so on a hit it
   is the offset of the chunk that holds the terminator and every chunk
   below it has been stored.  */
73 | L(Align16Both): |
/* The chunk at (%rcx) is stored without a test of its own: on this path it
   is the aligned chunk the entry code already tested.  */
74 | movaps (%rcx), %xmm1 |
75 | movaps 16(%rcx), %xmm2 |
76 | movaps %xmm1, (%rdx) |
77 | pcmpeqd %xmm2, %xmm0 |
78 | pmovmskb %xmm0, %rax |
79 | lea 16(%rsi), %rsi |
80 | |
81 | test %rax, %rax |
82 | jnz L(CopyFrom1To16Bytes) |
83 | |
/* Step 2: load the chunk at offset %rsi + 16, store the one at %rsi.  */
84 | movaps 16(%rcx, %rsi), %xmm3 |
85 | movaps %xmm2, (%rdx, %rsi) |
86 | pcmpeqd %xmm3, %xmm0 |
87 | pmovmskb %xmm0, %rax |
88 | lea 16(%rsi), %rsi |
89 | |
90 | test %rax, %rax |
91 | jnz L(CopyFrom1To16Bytes) |
92 | |
/* Step 3.  */
93 | movaps 16(%rcx, %rsi), %xmm4 |
94 | movaps %xmm3, (%rdx, %rsi) |
95 | pcmpeqd %xmm4, %xmm0 |
96 | pmovmskb %xmm0, %rax |
97 | lea 16(%rsi), %rsi |
98 | |
99 | test %rax, %rax |
100 | jnz L(CopyFrom1To16Bytes) |
101 | |
/* Step 4.  */
102 | movaps 16(%rcx, %rsi), %xmm1 |
103 | movaps %xmm4, (%rdx, %rsi) |
104 | pcmpeqd %xmm1, %xmm0 |
105 | pmovmskb %xmm0, %rax |
106 | lea 16(%rsi), %rsi |
107 | |
108 | test %rax, %rax |
109 | jnz L(CopyFrom1To16Bytes) |
110 | |
/* Step 5.  */
111 | movaps 16(%rcx, %rsi), %xmm2 |
112 | movaps %xmm1, (%rdx, %rsi) |
113 | pcmpeqd %xmm2, %xmm0 |
114 | pmovmskb %xmm0, %rax |
115 | lea 16(%rsi), %rsi |
116 | |
117 | test %rax, %rax |
118 | jnz L(CopyFrom1To16Bytes) |
119 | |
/* Step 6.  */
120 | movaps 16(%rcx, %rsi), %xmm3 |
121 | movaps %xmm2, (%rdx, %rsi) |
122 | pcmpeqd %xmm3, %xmm0 |
123 | pmovmskb %xmm0, %rax |
124 | lea 16(%rsi), %rsi |
125 | |
126 | test %rax, %rax |
127 | jnz L(CopyFrom1To16Bytes) |
128 | |
/* Store the last tested chunk, then set up the 64-byte loop that follows:
   round the address just past the stored data down to a 64-byte boundary
   for the source cursor and move the destination cursor by the same
   distance.  Bytes between that boundary and the stored data are simply
   copied a second time.  */
129 | movaps %xmm3, (%rdx, %rsi) |
130 | mov %rcx, %rax |
131 | lea 16(%rcx, %rsi), %rcx |
132 | and $-0x40, %rcx |
133 | sub %rcx, %rax |
134 | sub %rax, %rdx |
135 | |
/* The loop advances both cursors by 64 before leaving, so the offset of
   the first chunk of a line relative to the cursors is -0x40.  */
136 | mov $-0x40, %rsi |
137 | |
/* L(Aligned64Loop): main loop for the mutually aligned case.
   Each iteration loads one 64-byte line as four 16-byte chunks
   (%xmm4, %xmm5, %xmm6, %xmm7; %xmm2 and %xmm3 are working copies), folds
   them with a bytewise minimum and tests the result for a zero dword
   against the all-zero %xmm0 (which is only read here, not written).
   Because the minimum is bytewise while the compare is dword-wise, a hit
   means "possibly a terminator in this line"; L(Aligned64Leave) checks
   the chunks individually.  Both cursors are advanced before the branch,
   hence the negative store offsets.  */
138 | .p2align 4 |
139 | L(Aligned64Loop): |
140 | movaps (%rcx), %xmm2 |
141 | movaps %xmm2, %xmm4 |
142 | movaps 16(%rcx), %xmm5 |
143 | movaps 32(%rcx), %xmm3 |
144 | movaps %xmm3, %xmm6 |
145 | movaps 48(%rcx), %xmm7 |
146 | pminub %xmm5, %xmm2 |
147 | pminub %xmm7, %xmm3 |
148 | pminub %xmm2, %xmm3 |
149 | pcmpeqd %xmm0, %xmm3 |
150 | pmovmskb %xmm3, %rax |
/* lea leaves the flags alone, but the test below is what jnz consumes.  */
151 | lea 64(%rdx), %rdx |
152 | lea 64(%rcx), %rcx |
153 | test %rax, %rax |
154 | jnz L(Aligned64Leave) |
/* No zero dword in the folded line: store all 64 bytes and continue.  */
155 | movaps %xmm4, -64(%rdx) |
156 | movaps %xmm5, -48(%rdx) |
157 | movaps %xmm6, -32(%rdx) |
158 | movaps %xmm7, -16(%rdx) |
159 | jmp L(Aligned64Loop) |
160 | |
/* L(Aligned64Leave): the folded test in L(Aligned64Loop) reported a hit.
   Entered with the four chunks of the line still in %xmm4..%xmm7, both
   cursors already 64 bytes past the line start, %rsi = -0x40 and
   %xmm0 = 0.  Each chunk is tested on its own; the chunks before the one
   under test are stored as the scan proceeds, and %rsi is stepped by 16 so
   that it is always the (negative) offset of the chunk under test when
   control transfers to L(CopyFrom1To16Bytes).  */
161 | L(Aligned64Leave): |
/* Chunk 0, offset -0x40.  */
162 | pcmpeqd %xmm4, %xmm0 |
163 | pmovmskb %xmm0, %rax |
164 | test %rax, %rax |
165 | jnz L(CopyFrom1To16Bytes) |
166 | |
/* Chunk 1, offset -0x30.  The store of chunk 0 and the lea sit between
   the compare and the branch; neither changes the flags set by test.  */
167 | pcmpeqd %xmm5, %xmm0 |
168 | |
169 | pmovmskb %xmm0, %rax |
170 | movaps %xmm4, -64(%rdx) |
171 | test %rax, %rax |
172 | lea 16(%rsi), %rsi |
173 | jnz L(CopyFrom1To16Bytes) |
174 | |
/* Chunk 2, offset -0x20.  */
175 | pcmpeqd %xmm6, %xmm0 |
176 | |
177 | pmovmskb %xmm0, %rax |
178 | movaps %xmm5, -48(%rdx) |
179 | test %rax, %rax |
180 | lea 16(%rsi), %rsi |
181 | jnz L(CopyFrom1To16Bytes) |
182 | |
/* Chunk 3, offset -0x10.  */
183 | movaps %xmm6, -32(%rdx) |
184 | pcmpeqd %xmm7, %xmm0 |
185 | |
186 | pmovmskb %xmm0, %rax |
187 | lea 16(%rsi), %rsi |
188 | test %rax, %rax |
189 | jnz L(CopyFrom1To16Bytes) |
190 | |
/* None of the four chunks holds a zero element, so the folded test was a
   false positive.  Store the last chunk, restore %rsi and resume the
   loop.  */
191 | mov $-0x40, %rsi |
192 | movaps %xmm7, -16(%rdx) |
193 | jmp L(Aligned64Loop) |
194 | |
/* L(Shl4): destination is 16-byte aligned, source cursor is 4 bytes past
   a 16-byte boundary.  All loads are aligned (offsets -4, 12, 28, ... from
   %rcx); palignr $4 stitches two neighbouring aligned chunks into the 16
   source bytes that start at %rcx, which are then stored aligned at
   (%rdx).  %xmm0 is the all-zero comparand on entry.  */
195 | .p2align 4 |
196 | L(Shl4): |
/* %xmm1 = aligned chunk ending 12 bytes into the data at %rcx,
   %xmm2 = the aligned chunk after it.  */
197 | movaps -4(%rcx), %xmm1 |
198 | movaps 12(%rcx), %xmm2 |
/* L(Shl4Start) is also the re-entry point from the 64-byte loop below,
   with %xmm1 / %xmm2 holding the same two chunks relative to %rcx.  */
199 | L(Shl4Start): |
/* Step 1: test the newer chunk; keep an unshifted copy in %xmm3 because
   palignr overwrites %xmm2.  */
200 | pcmpeqd %xmm2, %xmm0 |
201 | pmovmskb %xmm0, %rax |
202 | movaps %xmm2, %xmm3 |
203 | |
204 | test %rax, %rax |
205 | jnz L(Shl4LoopExit) |
206 | |
/* %xmm2 = last 12 bytes of %xmm1 followed by first 4 bytes of %xmm2.  */
207 | palignr $4, %xmm1, %xmm2 |
208 | movaps %xmm2, (%rdx) |
209 | movaps 28(%rcx), %xmm2 |
210 | |
/* Step 2: same pattern; %xmm1 and %xmm3 alternate as the holder of the
   previous unshifted chunk.  After the two lea instructions the chunk
   just loaded sits at 12(%rcx) again.  */
211 | pcmpeqd %xmm2, %xmm0 |
212 | lea 16(%rdx), %rdx |
213 | pmovmskb %xmm0, %rax |
214 | lea 16(%rcx), %rcx |
215 | movaps %xmm2, %xmm1 |
216 | |
217 | test %rax, %rax |
218 | jnz L(Shl4LoopExit) |
219 | |
220 | palignr $4, %xmm3, %xmm2 |
221 | movaps %xmm2, (%rdx) |
222 | movaps 28(%rcx), %xmm2 |
223 | |
/* Step 3.  */
224 | pcmpeqd %xmm2, %xmm0 |
225 | lea 16(%rdx), %rdx |
226 | pmovmskb %xmm0, %rax |
227 | lea 16(%rcx), %rcx |
228 | movaps %xmm2, %xmm3 |
229 | |
230 | test %rax, %rax |
231 | jnz L(Shl4LoopExit) |
232 | |
233 | palignr $4, %xmm1, %xmm2 |
234 | movaps %xmm2, (%rdx) |
235 | movaps 28(%rcx), %xmm2 |
236 | |
/* Step 4: no unshifted copy is kept; %xmm1 is reloaded from memory
   below.  */
237 | pcmpeqd %xmm2, %xmm0 |
238 | lea 16(%rdx), %rdx |
239 | pmovmskb %xmm0, %rax |
240 | lea 16(%rcx), %rcx |
241 | |
242 | test %rax, %rax |
243 | jnz L(Shl4LoopExit) |
244 | |
245 | palignr $4, %xmm3, %xmm2 |
246 | movaps %xmm2, (%rdx) |
/* Set up the 64-byte loop.  %rcx + 28 is the next aligned chunk not yet
   loaded; round it down to a 64-byte boundary, re-apply the bias so that
   12(%rcx) is that boundary, and pull %rdx back by the distance rounded
   off.  Bytes covered twice are simply stored again.  */
247 | lea 28(%rcx), %rcx |
248 | lea 16(%rdx), %rdx |
249 | |
250 | mov %rcx, %rax |
251 | and $-0x40, %rcx |
252 | sub %rcx, %rax |
253 | lea -12(%rcx), %rcx |
254 | sub %rax, %rdx |
255 | |
/* %xmm1 = aligned chunk immediately below the first 64-byte line.  */
256 | movaps -4(%rcx), %xmm1 |
257 | |
/* L(Shl4LoopStart): per iteration, load one 64-byte-aligned line as four
   chunks (%xmm2..%xmm5), fold them with a bytewise minimum in %xmm6/%xmm7
   and test the result for a zero dword.  The minimum is bytewise while
   the compare is dword-wise, so a hit only means "possibly a terminator";
   L(Shl4Start) then rechecks chunk by chunk using the still unshifted
   %xmm1 / %xmm2 and reloads the rest from memory.  */
258 | .p2align 4 |
259 | L(Shl4LoopStart): |
260 | movaps 12(%rcx), %xmm2 |
261 | movaps 28(%rcx), %xmm3 |
262 | movaps %xmm3, %xmm6 |
263 | movaps 44(%rcx), %xmm4 |
264 | movaps %xmm4, %xmm7 |
265 | movaps 60(%rcx), %xmm5 |
266 | pminub %xmm2, %xmm6 |
267 | pminub %xmm5, %xmm7 |
268 | pminub %xmm6, %xmm7 |
269 | pcmpeqd %xmm0, %xmm7 |
270 | pmovmskb %xmm7, %rax |
/* Save the unshifted last chunk in %xmm7 (it becomes %xmm1 for the next
   line).  The palignr instructions interleaved with test/jnz do not touch
   the flags.  */
271 | movaps %xmm5, %xmm7 |
272 | palignr $4, %xmm4, %xmm5 |
273 | test %rax, %rax |
274 | palignr $4, %xmm3, %xmm4 |
275 | jnz L(Shl4Start) |
276 | |
/* No hit: finish the shifts, store the 64 bytes starting at source
   %rcx, and advance both cursors.  */
277 | palignr $4, %xmm2, %xmm3 |
278 | lea 64(%rcx), %rcx |
279 | palignr $4, %xmm1, %xmm2 |
280 | movaps %xmm7, %xmm1 |
281 | movaps %xmm5, 48(%rdx) |
282 | movaps %xmm4, 32(%rdx) |
283 | movaps %xmm3, 16(%rdx) |
284 | movaps %xmm2, (%rdx) |
285 | lea 64(%rdx), %rdx |
286 | jmp L(Shl4LoopStart) |
287 | |
/* L(Shl4LoopExit): the aligned chunk at 12(%rcx) holds the terminator and
   %rax is its mask.  The 12 bytes in front of it are still pending; they
   are copied with one unaligned 16-byte move that starts 4 bytes early
   (those 4 bytes have presumably been written already -- NOTE(review):
   relies on the destination start being at least 4 bytes below %rdx).
   %rsi = 12 makes L(CopyFrom1To16Bytes) point at the terminator chunk.  */
288 | L(Shl4LoopExit): |
289 | movdqu -4(%rcx), %xmm1 |
290 | mov $12, %rsi |
291 | movdqu %xmm1, -4(%rdx) |
292 | jmp L(CopyFrom1To16Bytes) |
293 | |
/* L(Shl8): destination is 16-byte aligned, source cursor is 8 bytes past
   a 16-byte boundary.  All loads are aligned (offsets -8, 8, 24, ... from
   %rcx); palignr $8 stitches two neighbouring aligned chunks into the 16
   source bytes that start at %rcx, which are then stored aligned at
   (%rdx).  %xmm0 is the all-zero comparand on entry.  */
294 | .p2align 4 |
295 | L(Shl8): |
/* %xmm1 = aligned chunk ending 8 bytes into the data at %rcx,
   %xmm2 = the aligned chunk after it.  */
296 | movaps -8(%rcx), %xmm1 |
297 | movaps 8(%rcx), %xmm2 |
/* L(Shl8Start) is also the re-entry point from the 64-byte loop below.  */
298 | L(Shl8Start): |
/* Step 1: test the newer chunk; keep an unshifted copy in %xmm3 because
   palignr overwrites %xmm2.  */
299 | pcmpeqd %xmm2, %xmm0 |
300 | pmovmskb %xmm0, %rax |
301 | movaps %xmm2, %xmm3 |
302 | |
303 | test %rax, %rax |
304 | jnz L(Shl8LoopExit) |
305 | |
/* %xmm2 = upper 8 bytes of %xmm1 followed by lower 8 bytes of %xmm2.  */
306 | palignr $8, %xmm1, %xmm2 |
307 | movaps %xmm2, (%rdx) |
308 | movaps 24(%rcx), %xmm2 |
309 | |
/* Step 2: same pattern; %xmm1 and %xmm3 alternate as the holder of the
   previous unshifted chunk.  */
310 | pcmpeqd %xmm2, %xmm0 |
311 | lea 16(%rdx), %rdx |
312 | pmovmskb %xmm0, %rax |
313 | lea 16(%rcx), %rcx |
314 | movaps %xmm2, %xmm1 |
315 | |
316 | test %rax, %rax |
317 | jnz L(Shl8LoopExit) |
318 | |
319 | palignr $8, %xmm3, %xmm2 |
320 | movaps %xmm2, (%rdx) |
321 | movaps 24(%rcx), %xmm2 |
322 | |
/* Step 3.  */
323 | pcmpeqd %xmm2, %xmm0 |
324 | lea 16(%rdx), %rdx |
325 | pmovmskb %xmm0, %rax |
326 | lea 16(%rcx), %rcx |
327 | movaps %xmm2, %xmm3 |
328 | |
329 | test %rax, %rax |
330 | jnz L(Shl8LoopExit) |
331 | |
332 | palignr $8, %xmm1, %xmm2 |
333 | movaps %xmm2, (%rdx) |
334 | movaps 24(%rcx), %xmm2 |
335 | |
/* Step 4: no unshifted copy is kept; %xmm1 is reloaded from memory
   below.  */
336 | pcmpeqd %xmm2, %xmm0 |
337 | lea 16(%rdx), %rdx |
338 | pmovmskb %xmm0, %rax |
339 | lea 16(%rcx), %rcx |
340 | |
341 | test %rax, %rax |
342 | jnz L(Shl8LoopExit) |
343 | |
344 | palignr $8, %xmm3, %xmm2 |
345 | movaps %xmm2, (%rdx) |
/* Set up the 64-byte loop.  %rcx + 24 is the next aligned chunk not yet
   loaded; round it down to a 64-byte boundary, re-apply the bias so that
   8(%rcx) is that boundary, and pull %rdx back by the distance rounded
   off.  Bytes covered twice are simply stored again.  */
346 | lea 24(%rcx), %rcx |
347 | lea 16(%rdx), %rdx |
348 | |
349 | mov %rcx, %rax |
350 | and $-0x40, %rcx |
351 | sub %rcx, %rax |
352 | lea -8(%rcx), %rcx |
353 | sub %rax, %rdx |
354 | |
/* %xmm1 = aligned chunk immediately below the first 64-byte line.  */
355 | movaps -8(%rcx), %xmm1 |
356 | |
/* L(Shl8LoopStart): per iteration, load one 64-byte-aligned line as four
   chunks (%xmm2..%xmm5), fold them with a bytewise minimum in %xmm6/%xmm7
   and test the result for a zero dword.  The minimum is bytewise while
   the compare is dword-wise, so a hit only means "possibly a terminator";
   L(Shl8Start) then rechecks chunk by chunk using the still unshifted
   %xmm1 / %xmm2 and reloads the rest from memory.  */
357 | .p2align 4 |
358 | L(Shl8LoopStart): |
359 | movaps 8(%rcx), %xmm2 |
360 | movaps 24(%rcx), %xmm3 |
361 | movaps %xmm3, %xmm6 |
362 | movaps 40(%rcx), %xmm4 |
363 | movaps %xmm4, %xmm7 |
364 | movaps 56(%rcx), %xmm5 |
365 | pminub %xmm2, %xmm6 |
366 | pminub %xmm5, %xmm7 |
367 | pminub %xmm6, %xmm7 |
368 | pcmpeqd %xmm0, %xmm7 |
369 | pmovmskb %xmm7, %rax |
/* Save the unshifted last chunk in %xmm7 (it becomes %xmm1 for the next
   line).  The palignr instructions interleaved with test/jnz do not touch
   the flags.  */
370 | movaps %xmm5, %xmm7 |
371 | palignr $8, %xmm4, %xmm5 |
372 | test %rax, %rax |
373 | palignr $8, %xmm3, %xmm4 |
374 | jnz L(Shl8Start) |
375 | |
/* No hit: finish the shifts, store the 64 bytes starting at source
   %rcx, and advance both cursors.  */
376 | palignr $8, %xmm2, %xmm3 |
377 | lea 64(%rcx), %rcx |
378 | palignr $8, %xmm1, %xmm2 |
379 | movaps %xmm7, %xmm1 |
380 | movaps %xmm5, 48(%rdx) |
381 | movaps %xmm4, 32(%rdx) |
382 | movaps %xmm3, 16(%rdx) |
383 | movaps %xmm2, (%rdx) |
384 | lea 64(%rdx), %rdx |
385 | jmp L(Shl8LoopStart) |
386 | |
/* L(Shl8LoopExit): the aligned chunk at 8(%rcx) holds the terminator and
   %rax is its mask.  Copy the 8 pending bytes in front of it, then let
   L(CopyFrom1To16Bytes) continue at offset %rsi = 8.  */
387 | L(Shl8LoopExit): |
388 | mov (%rcx), %r9 |
389 | mov $8, %rsi |
390 | mov %r9, (%rdx) |
391 | jmp L(CopyFrom1To16Bytes) |
392 | |
/* L(Shl12): destination is 16-byte aligned, source cursor is 12 bytes past
   a 16-byte boundary (this is the fall-through case of the dispatch, taken
   for any offset other than 0, 4 or 8).  All loads are aligned (offsets
   -12, 4, 20, ... from %rcx); palignr $12 stitches two neighbouring
   aligned chunks into the 16 source bytes that start at %rcx, which are
   then stored aligned at (%rdx).  %xmm0 is the all-zero comparand on
   entry.  */
393 | .p2align 4 |
394 | L(Shl12): |
/* %xmm1 = aligned chunk ending 4 bytes into the data at %rcx,
   %xmm2 = the aligned chunk after it.  */
395 | movaps -12(%rcx), %xmm1 |
396 | movaps 4(%rcx), %xmm2 |
/* L(Shl12Start) is also the re-entry point from the 64-byte loop below.  */
397 | L(Shl12Start): |
/* Step 1: test the newer chunk; keep an unshifted copy in %xmm3 because
   palignr overwrites %xmm2.  */
398 | pcmpeqd %xmm2, %xmm0 |
399 | pmovmskb %xmm0, %rax |
400 | movaps %xmm2, %xmm3 |
401 | |
402 | test %rax, %rax |
403 | jnz L(Shl12LoopExit) |
404 | |
/* %xmm2 = last 4 bytes of %xmm1 followed by first 12 bytes of %xmm2.  */
405 | palignr $12, %xmm1, %xmm2 |
406 | movaps %xmm2, (%rdx) |
407 | movaps 20(%rcx), %xmm2 |
408 | |
/* Step 2: same pattern; %xmm1 and %xmm3 alternate as the holder of the
   previous unshifted chunk.  */
409 | pcmpeqd %xmm2, %xmm0 |
410 | lea 16(%rdx), %rdx |
411 | pmovmskb %xmm0, %rax |
412 | lea 16(%rcx), %rcx |
413 | movaps %xmm2, %xmm1 |
414 | |
415 | test %rax, %rax |
416 | jnz L(Shl12LoopExit) |
417 | |
418 | palignr $12, %xmm3, %xmm2 |
419 | movaps %xmm2, (%rdx) |
420 | movaps 20(%rcx), %xmm2 |
421 | |
/* Step 3.  */
422 | pcmpeqd %xmm2, %xmm0 |
423 | lea 16(%rdx), %rdx |
424 | pmovmskb %xmm0, %rax |
425 | lea 16(%rcx), %rcx |
426 | movaps %xmm2, %xmm3 |
427 | |
428 | test %rax, %rax |
429 | jnz L(Shl12LoopExit) |
430 | |
431 | palignr $12, %xmm1, %xmm2 |
432 | movaps %xmm2, (%rdx) |
433 | movaps 20(%rcx), %xmm2 |
434 | |
/* Step 4: no unshifted copy is kept; %xmm1 is reloaded from memory
   below.  */
435 | pcmpeqd %xmm2, %xmm0 |
436 | lea 16(%rdx), %rdx |
437 | pmovmskb %xmm0, %rax |
438 | lea 16(%rcx), %rcx |
439 | |
440 | test %rax, %rax |
441 | jnz L(Shl12LoopExit) |
442 | |
443 | palignr $12, %xmm3, %xmm2 |
444 | movaps %xmm2, (%rdx) |
/* Set up the 64-byte loop.  %rcx + 20 is the next aligned chunk not yet
   loaded; round it down to a 64-byte boundary, re-apply the bias so that
   4(%rcx) is that boundary, and pull %rdx back by the distance rounded
   off.  Bytes covered twice are simply stored again.  */
445 | lea 20(%rcx), %rcx |
446 | lea 16(%rdx), %rdx |
447 | |
448 | mov %rcx, %rax |
449 | and $-0x40, %rcx |
450 | sub %rcx, %rax |
451 | lea -4(%rcx), %rcx |
452 | sub %rax, %rdx |
453 | |
/* %xmm1 = aligned chunk immediately below the first 64-byte line.  */
454 | movaps -12(%rcx), %xmm1 |
455 | |
/* L(Shl12LoopStart): per iteration, load one 64-byte-aligned line as four
   chunks (%xmm2..%xmm5), fold them with a bytewise minimum in %xmm6/%xmm7
   and test the result for a zero dword.  The minimum is bytewise while
   the compare is dword-wise, so a hit only means "possibly a terminator";
   L(Shl12Start) then rechecks chunk by chunk using the still unshifted
   %xmm1 / %xmm2 and reloads the rest from memory.  */
456 | .p2align 4 |
457 | L(Shl12LoopStart): |
458 | movaps 4(%rcx), %xmm2 |
459 | movaps 20(%rcx), %xmm3 |
460 | movaps %xmm3, %xmm6 |
461 | movaps 36(%rcx), %xmm4 |
462 | movaps %xmm4, %xmm7 |
463 | movaps 52(%rcx), %xmm5 |
464 | pminub %xmm2, %xmm6 |
465 | pminub %xmm5, %xmm7 |
466 | pminub %xmm6, %xmm7 |
467 | pcmpeqd %xmm0, %xmm7 |
468 | pmovmskb %xmm7, %rax |
/* Save the unshifted last chunk in %xmm7 (it becomes %xmm1 for the next
   line).  The palignr instructions interleaved with test/jnz do not touch
   the flags.  */
469 | movaps %xmm5, %xmm7 |
470 | palignr $12, %xmm4, %xmm5 |
471 | test %rax, %rax |
472 | palignr $12, %xmm3, %xmm4 |
473 | jnz L(Shl12Start) |
/* No hit: finish the shifts, store the 64 bytes starting at source
   %rcx, and advance both cursors.  */
474 | palignr $12, %xmm2, %xmm3 |
475 | lea 64(%rcx), %rcx |
476 | palignr $12, %xmm1, %xmm2 |
477 | movaps %xmm7, %xmm1 |
478 | movaps %xmm5, 48(%rdx) |
479 | movaps %xmm4, 32(%rdx) |
480 | movaps %xmm3, 16(%rdx) |
481 | movaps %xmm2, (%rdx) |
482 | lea 64(%rdx), %rdx |
483 | jmp L(Shl12LoopStart) |
484 | |
/* L(Shl12LoopExit): the aligned chunk at 4(%rcx) holds the terminator and
   %rax is its mask.  Copy the single pending 4-byte element in front of
   it, then let L(CopyFrom1To16Bytes) continue at offset %rsi = 4.  */
485 | L(Shl12LoopExit): |
486 | mov (%rcx), %r9d |
487 | mov $4, %rsi |
488 | mov %r9d, (%rdx) |
489 | jmp L(CopyFrom1To16Bytes) |
490 | |
/* L(CopyFrom1To16Bytes): common tail.
   In: %rsi = offset (possibly negative) that makes %rcx / %rdx point at
            the 16-byte chunk holding the terminator,
       %rax = pmovmskb mask of that chunk's pcmpeqd-against-zero result
              (four mask bits per 32-bit element, lowest element first),
       %rdi = original destination.
   Copies the chunk up to and including the first zero element and
   returns %rdi.  */
491 | .p2align 4 |
492 | L(CopyFrom1To16Bytes): |
493 | add %rsi, %rdx |
494 | add %rsi, %rcx |
495 | |
/* %al covers elements 0 and 1; if it is clear the terminator is element 2
   or 3.  */
496 | test %al, %al |
497 | jz L(ExitHigh) |
/* Bit 0 set: element 0 is the terminator, copy 4 bytes.  */
498 | test $0x01, %al |
499 | jnz L(Exit4) |
500 | |
/* Otherwise element 1 is the terminator: copy 8 bytes and return.  %rax
   doubles as scratch here since the mask is no longer needed.  */
501 | mov (%rcx), %rax |
502 | mov %rax, (%rdx) |
503 | mov %rdi, %rax |
504 | ret |
505 | |
/* L(ExitHigh): the low byte of the mask in %rax is clear, so the
   terminator is element 2 or 3 of the chunk at (%rcx).  Bit 0 of %ah is
   the first mask bit of element 2.  */
506 | .p2align 4 |
507 | L(ExitHigh): |
508 | test $0x01, %ah |
509 | jnz L(Exit12) |
510 | |
/* Element 3 is the terminator: copy all 16 bytes as two 8-byte moves and
   return the original destination.  */
511 | mov (%rcx), %rax |
512 | mov %rax, (%rdx) |
513 | mov 8(%rcx), %rax |
514 | mov %rax, 8(%rdx) |
515 | mov %rdi, %rax |
516 | ret |
517 | |
/* L(Exit4): copy 4 bytes (one element, the terminator itself) from (%rcx)
   to (%rdx) and return the original destination held in %rdi.  */
518 | .p2align 4 |
519 | L(Exit4): |
520 | movl (%rcx), %eax |
521 | movl %eax, (%rdx) |
522 | mov %rdi, %rax |
523 | ret |
524 | |
/* L(Exit8): copy 8 bytes (two elements, the second being the terminator)
   from (%rcx) to (%rdx) with a single 64-bit move and return the original
   destination held in %rdi.  */
525 | .p2align 4 |
526 | L(Exit8): |
527 | mov (%rcx), %rax |
528 | mov %rax, (%rdx) |
529 | mov %rdi, %rax |
530 | ret |
531 | |
/* L(Exit12): copy 12 bytes (three elements, the third being the
   terminator) from (%rcx) to (%rdx) as one 8-byte and one 4-byte move,
   then return the original destination held in %rdi.  */
532 | .p2align 4 |
533 | L(Exit12): |
534 | mov (%rcx), %rax |
535 | mov %rax, (%rdx) |
536 | mov 8(%rcx), %eax |
537 | mov %eax, 8(%rdx) |
538 | mov %rdi, %rax |
539 | ret |
540 | |
/* L(Exit16): copy 16 bytes (four elements, the fourth being the
   terminator) from (%rcx) to (%rdx) as two 8-byte moves, then return the
   original destination held in %rdi.  Reached from the entry checks when
   the fourth source element is zero.  */
541 | .p2align 4 |
542 | L(Exit16): |
543 | mov (%rcx), %rax |
544 | mov %rax, (%rdx) |
545 | mov 8(%rcx), %rax |
546 | mov %rax, 8(%rdx) |
547 | mov %rdi, %rax |
548 | ret |
549 | |
/* Closes the function opened by ENTRY (__wcscpy_ssse3).  */
550 | END(__wcscpy_ssse3) |
551 | #endif |
552 | |