1 | /* wcscpy with SSSE3 |
2 | Copyright (C) 2011-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | # include <sysdep.h> |
21 | |
/* Unwind annotations that accompany a 4-byte push of REG: the CFA offset
   grows by 4 and REG is described at offset 0.  */
22 | # define CFI_PUSH(REG) \ |
23 | cfi_adjust_cfa_offset (4); \ |
24 | cfi_rel_offset (REG, 0) |
25 | |
/* Unwind annotations that accompany a 4-byte pop of REG, the inverse of
   CFI_PUSH.  */
26 | # define CFI_POP(REG) \ |
27 | cfi_adjust_cfa_offset (-4); \ |
28 | cfi_restore (REG) |
29 | |
/* Push or pop REG together with the matching unwind annotations.  */
30 | # define PUSH(REG) pushl REG; CFI_PUSH (REG) |
31 | # define POP(REG) popl REG; CFI_POP (REG) |
32 | |
/* PARMS is the %esp-relative offset of the first stack argument at function
   entry.  The function loads STR1 (first argument) as the destination
   pointer and STR2 (second argument) as the source pointer.  LEN is defined
   but not referenced by the code visible here.

   RETURN restores %edi and returns.  The CFI_PUSH placed after the ret
   consists only of cfi_* annotations, so that code assembled after a RETURN
   is again described as running with %edi pushed.  */
33 | # define PARMS 4 |
34 | # define RETURN POP (%edi); ret; CFI_PUSH (%edi) |
35 | # define STR1 PARMS |
36 | # define STR2 STR1+4 |
37 | # define LEN STR2+4 |
38 | |
/* __wcscpy_ssse3 (first stack argument: destination, second: source).
   Copies 4-byte elements from source to destination up to and including
   the first all-zero element, and returns the original destination pointer
   in %eax.

   Register roles on the main path:
     %edx   destination cursor
     %ecx   source cursor
     %edi   original destination, kept for the return value (pushed/popped)
     %esi   byte offset or scratch (pushed/popped)
     %xmm0  all-zero comparand for pcmpeqd
     %eax   pmovmskb byte mask of the most recent comparison  */
39 | atom_text_section |
40 | ENTRY (__wcscpy_ssse3) |
41 | mov STR1(%esp), %edx |
42 | mov STR2(%esp), %ecx |
43 | |
/* Test the first four elements one at a time.  A terminator within the
   first 16 bytes is handled by the ExitTail paths, which run before any
   register has been pushed.  */
44 | cmpl $0, (%ecx) |
45 | jz L(ExitTail4) |
46 | cmpl $0, 4(%ecx) |
47 | jz L(ExitTail8) |
48 | cmpl $0, 8(%ecx) |
49 | jz L(ExitTail12) |
50 | cmpl $0, 12(%ecx) |
51 | jz L(ExitTail16) |
52 | |
/* No terminator in the first 16 bytes.  Save %edi and %esi, remember the
   original destination in %edi, and point %esi at the first 16-byte
   boundary above the source pointer.  */
53 | PUSH (%edi) |
54 | mov %edx, %edi |
55 | PUSH (%esi) |
56 | lea 16(%ecx), %esi |
57 | |
58 | and $-16, %esi |
59 | |
/* Compare the aligned source block at %esi against zero, and copy the first
   16 source bytes with unaligned moves.  */
60 | pxor %xmm0, %xmm0 |
61 | pcmpeqd (%esi), %xmm0 |
62 | movdqu (%ecx), %xmm1 |
63 | movdqu %xmm1, (%edx) |
64 | |
/* %eax = byte mask of zero elements in the aligned block.  %esi becomes the
   distance (1..16) from the source pointer to that block, which is the
   offset CopyFrom1To16Bytes adds to both cursors.  */
65 | pmovmskb %xmm0, %eax |
66 | sub %ecx, %esi |
67 | |
68 | test %eax, %eax |
69 | jnz L(CopyFrom1To16Bytes) |
70 | |
/* Advance the destination to its next 16-byte boundary and move the source
   forward by the same number of bytes (%eax holds the negated distance).  */
71 | mov %edx, %eax |
72 | lea 16(%edx), %edx |
73 | and $-16, %edx |
74 | sub %edx, %eax |
75 | |
76 | sub %eax, %ecx |
/* %eax = offset of the source within its 16-byte block now that the
   destination is aligned.  The mov into %esi does not modify flags, so the
   jz below still tests the result of the and.  */
77 | mov %ecx, %eax |
78 | and $0xf, %eax |
79 | mov $0, %esi |
80 | |
81 | jz L(Align16Both) |
/* Non-zero misalignment: 4 and 8 are matched explicitly, every other value
   goes to Shl12 (presumably always 12, assuming a 4-byte aligned source).  */
82 | cmp $4, %eax |
83 | je L(Shl4) |
84 | cmp $8, %eax |
85 | je L(Shl8) |
86 | jmp L(Shl12) |
87 | |
/* Source and destination are both 16-byte aligned.  %esi is the running byte
   offset and is 0 on entry.  Each unrolled step loads the next 16 source
   bytes, stores the previously loaded vector, and compares the new vector
   against zero.  %esi is advanced before the branch so that, on a hit,
   %ecx + %esi and %edx + %esi address the vector holding the terminator.
   %xmm0 stays all-zero as long as no comparison matches.  */
88 | L(Align16Both): |
89 | movaps (%ecx), %xmm1 |
90 | movaps 16(%ecx), %xmm2 |
91 | movaps %xmm1, (%edx) |
92 | pcmpeqd %xmm2, %xmm0 |
93 | pmovmskb %xmm0, %eax |
94 | lea 16(%esi), %esi |
95 | |
96 | test %eax, %eax |
97 | jnz L(CopyFrom1To16Bytes) |
98 | |
99 | movaps 16(%ecx, %esi), %xmm3 |
100 | movaps %xmm2, (%edx, %esi) |
101 | pcmpeqd %xmm3, %xmm0 |
102 | pmovmskb %xmm0, %eax |
103 | lea 16(%esi), %esi |
104 | |
105 | test %eax, %eax |
106 | jnz L(CopyFrom1To16Bytes) |
107 | |
108 | movaps 16(%ecx, %esi), %xmm4 |
109 | movaps %xmm3, (%edx, %esi) |
110 | pcmpeqd %xmm4, %xmm0 |
111 | pmovmskb %xmm0, %eax |
112 | lea 16(%esi), %esi |
113 | |
114 | test %eax, %eax |
115 | jnz L(CopyFrom1To16Bytes) |
116 | |
117 | movaps 16(%ecx, %esi), %xmm1 |
118 | movaps %xmm4, (%edx, %esi) |
119 | pcmpeqd %xmm1, %xmm0 |
120 | pmovmskb %xmm0, %eax |
121 | lea 16(%esi), %esi |
122 | |
123 | test %eax, %eax |
124 | jnz L(CopyFrom1To16Bytes) |
125 | |
126 | movaps 16(%ecx, %esi), %xmm2 |
127 | movaps %xmm1, (%edx, %esi) |
128 | pcmpeqd %xmm2, %xmm0 |
129 | pmovmskb %xmm0, %eax |
130 | lea 16(%esi), %esi |
131 | |
132 | test %eax, %eax |
133 | jnz L(CopyFrom1To16Bytes) |
134 | |
135 | movaps 16(%ecx, %esi), %xmm3 |
136 | movaps %xmm2, (%edx, %esi) |
137 | pcmpeqd %xmm3, %xmm0 |
138 | pmovmskb %xmm0, %eax |
139 | lea 16(%esi), %esi |
140 | |
141 | test %eax, %eax |
142 | jnz L(CopyFrom1To16Bytes) |
143 | |
/* Seven vectors have been verified terminator-free.  Store the last one,
   then round the source cursor down to a 64-byte boundary and move the
   destination cursor by the same delta.  Because the source is rounded down,
   the 64-byte loop may store again some bytes that were already copied.  */
144 | movaps %xmm3, (%edx, %esi) |
145 | mov %ecx, %eax |
146 | lea 16(%ecx, %esi), %ecx |
147 | and $-0x40, %ecx |
148 | sub %ecx, %eax |
149 | sub %eax, %edx |
150 | |
/* The loop advances both cursors by 64 before testing, so %esi = -64 makes
   cursor + %esi address the first vector of the current 64-byte block.  */
151 | mov $-0x40, %esi |
152 | |
/* Main loop for mutually aligned source and destination: 64 bytes per
   iteration.  The four vectors are kept in %xmm4, %xmm5, %xmm6 and %xmm7.
   Their byte-wise minimum (pminub) is compared dword-wise against the zero
   in %xmm0.  Both cursors are advanced by 64 before the test, hence the
   negative store offsets.  */
153 | L(Aligned64Loop): |
154 | movaps (%ecx), %xmm2 |
155 | movaps 32(%ecx), %xmm3 |
156 | movaps %xmm2, %xmm4 |
157 | movaps 16(%ecx), %xmm5 |
158 | movaps %xmm3, %xmm6 |
159 | movaps 48(%ecx), %xmm7 |
160 | pminub %xmm5, %xmm2 |
161 | pminub %xmm7, %xmm3 |
162 | pminub %xmm2, %xmm3 |
163 | lea 64(%edx), %edx |
164 | pcmpeqd %xmm0, %xmm3 |
165 | lea 64(%ecx), %ecx |
166 | pmovmskb %xmm3, %eax |
167 | |
168 | test %eax, %eax |
169 | jnz L(Aligned64Leave) |
170 | movaps %xmm4, -64(%edx) |
171 | movaps %xmm5, -48(%edx) |
172 | movaps %xmm6, -32(%edx) |
173 | movaps %xmm7, -16(%edx) |
174 | jmp L(Aligned64Loop) |
175 | |
/* The combined test reported a zero dword.  Check the four vectors one by
   one: each vector is stored only once it is known to hold no terminator,
   and %esi steps from -64 in units of 16 so that cursor + %esi addresses the
   vector being tested when control reaches CopyFrom1To16Bytes.  The lea
   between test and jnz does not modify flags.  */
176 | L(Aligned64Leave): |
177 | pcmpeqd %xmm4, %xmm0 |
178 | pmovmskb %xmm0, %eax |
179 | test %eax, %eax |
180 | jnz L(CopyFrom1To16Bytes) |
181 | |
182 | pcmpeqd %xmm5, %xmm0 |
183 | pmovmskb %xmm0, %eax |
184 | movaps %xmm4, -64(%edx) |
185 | test %eax, %eax |
186 | lea 16(%esi), %esi |
187 | jnz L(CopyFrom1To16Bytes) |
188 | |
189 | pcmpeqd %xmm6, %xmm0 |
190 | pmovmskb %xmm0, %eax |
191 | movaps %xmm5, -48(%edx) |
192 | test %eax, %eax |
193 | lea 16(%esi), %esi |
194 | jnz L(CopyFrom1To16Bytes) |
195 | |
196 | movaps %xmm6, -32(%edx) |
197 | pcmpeqd %xmm7, %xmm0 |
198 | pmovmskb %xmm0, %eax |
199 | test %eax, %eax |
200 | lea 16(%esi), %esi |
201 | jnz L(CopyFrom1To16Bytes) |
202 | |
/* None of the four vectors holds a zero element.  The byte-wise minimum can
   form a zero dword out of zero bytes that come from different vectors, so
   the combined test can fire without a terminator.  Store the last vector,
   reset %esi and resume the loop.  */
203 | mov $-0x40, %esi |
204 | movaps %xmm7, -16(%edx) |
205 | jmp L(Aligned64Loop) |
206 | |
/* Destination is 16-byte aligned and the source cursor lies 4 bytes above a
   16-byte boundary.  All loads use aligned addresses (-4, 12, 28, ... from
   %ecx).  palignr $4 joins the newer vector (high half) with the older one
   (low half) and shifts right by 4 bytes, which yields the 16 bytes that
   start at %ecx.  %xmm1 and %xmm3 alternate as the older vector.  Every
   newly loaded vector is checked for a zero element before it is used.  */
207 | .p2align 4 |
208 | L(Shl4): |
209 | movaps -4(%ecx), %xmm1 |
210 | movaps 12(%ecx), %xmm2 |
/* Also entered from the 64-byte loop below, with %xmm1 and %xmm2 holding the
   same two vectors relative to the current %ecx.  */
211 | L(Shl4Start): |
212 | pcmpeqd %xmm2, %xmm0 |
213 | pmovmskb %xmm0, %eax |
214 | movaps %xmm2, %xmm3 |
215 | |
216 | test %eax, %eax |
217 | jnz L(Shl4LoopExit) |
218 | |
219 | palignr $4, %xmm1, %xmm2 |
220 | movaps %xmm2, (%edx) |
221 | movaps 28(%ecx), %xmm2 |
222 | |
223 | pcmpeqd %xmm2, %xmm0 |
224 | lea 16(%edx), %edx |
225 | pmovmskb %xmm0, %eax |
226 | lea 16(%ecx), %ecx |
227 | movaps %xmm2, %xmm1 |
228 | |
229 | test %eax, %eax |
230 | jnz L(Shl4LoopExit) |
231 | |
232 | palignr $4, %xmm3, %xmm2 |
233 | movaps %xmm2, (%edx) |
234 | movaps 28(%ecx), %xmm2 |
235 | |
236 | pcmpeqd %xmm2, %xmm0 |
237 | lea 16(%edx), %edx |
238 | pmovmskb %xmm0, %eax |
239 | lea 16(%ecx), %ecx |
240 | movaps %xmm2, %xmm3 |
241 | |
242 | test %eax, %eax |
243 | jnz L(Shl4LoopExit) |
244 | |
245 | palignr $4, %xmm1, %xmm2 |
246 | movaps %xmm2, (%edx) |
247 | movaps 28(%ecx), %xmm2 |
248 | |
249 | pcmpeqd %xmm2, %xmm0 |
250 | lea 16(%edx), %edx |
251 | pmovmskb %xmm0, %eax |
252 | lea 16(%ecx), %ecx |
253 | |
254 | test %eax, %eax |
255 | jnz L(Shl4LoopExit) |
256 | |
/* Four vectors done without finding a terminator.  Store the fourth, then
   move %ecx to the aligned address of the last loaded vector, round it down
   to a 64-byte boundary, and shift %edx by the same delta.  Subtracting 12
   restores the 4-byte skew so that 12(%ecx) is 64-byte aligned.  */
257 | palignr $4, %xmm3, %xmm2 |
258 | movaps %xmm2, (%edx) |
259 | lea 28(%ecx), %ecx |
260 | lea 16(%edx), %edx |
261 | |
262 | mov %ecx, %eax |
263 | and $-0x40, %ecx |
264 | sub %ecx, %eax |
265 | lea -12(%ecx), %ecx |
266 | sub %eax, %edx |
267 | |
/* %xmm1 = aligned vector that precedes the first vector of the loop.  */
268 | movaps -4(%ecx), %xmm1 |
269 | |
/* 64 bytes per iteration.  The byte-wise minimum of the four loaded vectors
   is compared dword-wise against zero.  %xmm7 keeps a copy of the last
   loaded vector because palignr overwrites %xmm5.  palignr does not modify
   flags, so the jnz tests the preceding test.  On a reported zero dword
   control returns to Shl4Start, which re-checks vector by vector.  */
270 | L(Shl4LoopStart): |
271 | movaps 12(%ecx), %xmm2 |
272 | movaps 28(%ecx), %xmm3 |
273 | movaps %xmm3, %xmm6 |
274 | movaps 44(%ecx), %xmm4 |
275 | movaps %xmm4, %xmm7 |
276 | movaps 60(%ecx), %xmm5 |
277 | pminub %xmm2, %xmm6 |
278 | pminub %xmm5, %xmm7 |
279 | pminub %xmm6, %xmm7 |
280 | pcmpeqd %xmm0, %xmm7 |
281 | pmovmskb %xmm7, %eax |
282 | movaps %xmm5, %xmm7 |
283 | palignr $4, %xmm4, %xmm5 |
284 | test %eax, %eax |
285 | palignr $4, %xmm3, %xmm4 |
286 | jnz L(Shl4Start) |
287 | |
/* No zero dword: finish the shifts, store 64 bytes, and carry the last
   loaded vector into the next iteration through %xmm1.  */
288 | palignr $4, %xmm2, %xmm3 |
289 | lea 64(%ecx), %ecx |
290 | palignr $4, %xmm1, %xmm2 |
291 | movaps %xmm7, %xmm1 |
292 | movaps %xmm5, 48(%edx) |
293 | movaps %xmm4, 32(%edx) |
294 | movaps %xmm3, 16(%edx) |
295 | movaps %xmm2, (%edx) |
296 | lea 64(%edx), %edx |
297 | jmp L(Shl4LoopStart) |
298 | |
/* A zero element was found in the vector that starts 12 bytes above %ecx.
   Copy the 12 bytes in front of it (8 via %xmm0, 4 via %esi), restore %esi,
   and advance both cursors to that vector.  %eax still holds the byte mask:
   a zero %al means the terminator is in the upper 8 bytes, bit 0 set means
   it is the first element, otherwise it is the second element and 8 bytes
   are copied here.  */
299 | L(Shl4LoopExit): |
300 | movlpd (%ecx), %xmm0 |
301 | movl 8(%ecx), %esi |
302 | movlpd %xmm0, (%edx) |
303 | movl %esi, 8(%edx) |
304 | POP (%esi) |
305 | add $12, %edx |
306 | add $12, %ecx |
307 | test %al, %al |
308 | jz L(ExitHigh) |
309 | test $0x01, %al |
310 | jnz L(Exit4) |
311 | movlpd (%ecx), %xmm0 |
312 | movlpd %xmm0, (%edx) |
313 | movl %edi, %eax |
314 | RETURN |
315 | |
/* Annotation only: the code that follows runs with %esi still pushed.  */
316 | CFI_PUSH (%esi) |
317 | |
/* Destination is 16-byte aligned and the source cursor lies 8 bytes above a
   16-byte boundary.  All loads use aligned addresses (-8, 8, 24, ... from
   %ecx).  palignr $8 joins the newer vector (high half) with the older one
   (low half) and shifts right by 8 bytes, which yields the 16 bytes that
   start at %ecx.  %xmm1 and %xmm3 alternate as the older vector.  Every
   newly loaded vector is checked for a zero element before it is used.  */
318 | .p2align 4 |
319 | L(Shl8): |
320 | movaps -8(%ecx), %xmm1 |
321 | movaps 8(%ecx), %xmm2 |
/* Also entered from the 64-byte loop below, with %xmm1 and %xmm2 holding the
   same two vectors relative to the current %ecx.  */
322 | L(Shl8Start): |
323 | pcmpeqd %xmm2, %xmm0 |
324 | pmovmskb %xmm0, %eax |
325 | movaps %xmm2, %xmm3 |
326 | |
327 | test %eax, %eax |
328 | jnz L(Shl8LoopExit) |
329 | |
330 | palignr $8, %xmm1, %xmm2 |
331 | movaps %xmm2, (%edx) |
332 | movaps 24(%ecx), %xmm2 |
333 | |
334 | pcmpeqd %xmm2, %xmm0 |
335 | lea 16(%edx), %edx |
336 | pmovmskb %xmm0, %eax |
337 | lea 16(%ecx), %ecx |
338 | movaps %xmm2, %xmm1 |
339 | |
340 | test %eax, %eax |
341 | jnz L(Shl8LoopExit) |
342 | |
343 | palignr $8, %xmm3, %xmm2 |
344 | movaps %xmm2, (%edx) |
345 | movaps 24(%ecx), %xmm2 |
346 | |
347 | pcmpeqd %xmm2, %xmm0 |
348 | lea 16(%edx), %edx |
349 | pmovmskb %xmm0, %eax |
350 | lea 16(%ecx), %ecx |
351 | movaps %xmm2, %xmm3 |
352 | |
353 | test %eax, %eax |
354 | jnz L(Shl8LoopExit) |
355 | |
356 | palignr $8, %xmm1, %xmm2 |
357 | movaps %xmm2, (%edx) |
358 | movaps 24(%ecx), %xmm2 |
359 | |
360 | pcmpeqd %xmm2, %xmm0 |
361 | lea 16(%edx), %edx |
362 | pmovmskb %xmm0, %eax |
363 | lea 16(%ecx), %ecx |
364 | |
365 | test %eax, %eax |
366 | jnz L(Shl8LoopExit) |
367 | |
/* Four vectors done without finding a terminator.  Store the fourth, then
   move %ecx to the aligned address of the last loaded vector, round it down
   to a 64-byte boundary, and shift %edx by the same delta.  Subtracting 8
   restores the 8-byte skew so that 8(%ecx) is 64-byte aligned.  */
368 | palignr $8, %xmm3, %xmm2 |
369 | movaps %xmm2, (%edx) |
370 | lea 24(%ecx), %ecx |
371 | lea 16(%edx), %edx |
372 | |
373 | mov %ecx, %eax |
374 | and $-0x40, %ecx |
375 | sub %ecx, %eax |
376 | lea -8(%ecx), %ecx |
377 | sub %eax, %edx |
378 | |
/* %xmm1 = aligned vector that precedes the first vector of the loop.  */
379 | movaps -8(%ecx), %xmm1 |
380 | |
/* 64 bytes per iteration.  The byte-wise minimum of the four loaded vectors
   is compared dword-wise against zero.  %xmm7 keeps a copy of the last
   loaded vector because palignr overwrites %xmm5.  palignr does not modify
   flags, so the jnz tests the preceding test.  On a reported zero dword
   control returns to Shl8Start, which re-checks vector by vector.  */
381 | L(Shl8LoopStart): |
382 | movaps 8(%ecx), %xmm2 |
383 | movaps 24(%ecx), %xmm3 |
384 | movaps %xmm3, %xmm6 |
385 | movaps 40(%ecx), %xmm4 |
386 | movaps %xmm4, %xmm7 |
387 | movaps 56(%ecx), %xmm5 |
388 | pminub %xmm2, %xmm6 |
389 | pminub %xmm5, %xmm7 |
390 | pminub %xmm6, %xmm7 |
391 | pcmpeqd %xmm0, %xmm7 |
392 | pmovmskb %xmm7, %eax |
393 | movaps %xmm5, %xmm7 |
394 | palignr $8, %xmm4, %xmm5 |
395 | test %eax, %eax |
396 | palignr $8, %xmm3, %xmm4 |
397 | jnz L(Shl8Start) |
398 | |
/* No zero dword: finish the shifts, store 64 bytes, and carry the last
   loaded vector into the next iteration through %xmm1.  */
399 | palignr $8, %xmm2, %xmm3 |
400 | lea 64(%ecx), %ecx |
401 | palignr $8, %xmm1, %xmm2 |
402 | movaps %xmm7, %xmm1 |
403 | movaps %xmm5, 48(%edx) |
404 | movaps %xmm4, 32(%edx) |
405 | movaps %xmm3, 16(%edx) |
406 | movaps %xmm2, (%edx) |
407 | lea 64(%edx), %edx |
408 | jmp L(Shl8LoopStart) |
409 | |
/* A zero element was found in the vector that starts 8 bytes above %ecx.
   Copy the 8 bytes in front of it, restore %esi, and advance both cursors
   to that vector.  %eax still holds the byte mask: a zero %al means the
   terminator is in the upper 8 bytes, bit 0 set means it is the first
   element, otherwise it is the second element and 8 bytes are copied
   here.  */
410 | L(Shl8LoopExit): |
411 | movlpd (%ecx), %xmm0 |
412 | movlpd %xmm0, (%edx) |
413 | POP (%esi) |
414 | add $8, %edx |
415 | add $8, %ecx |
416 | test %al, %al |
417 | jz L(ExitHigh) |
418 | test $0x01, %al |
419 | jnz L(Exit4) |
420 | movlpd (%ecx), %xmm0 |
421 | movlpd %xmm0, (%edx) |
422 | movl %edi, %eax |
423 | RETURN |
424 | |
/* Annotation only: the code that follows runs with %esi still pushed.  */
425 | CFI_PUSH (%esi) |
426 | |
/* Destination is 16-byte aligned and the source cursor lies 12 bytes above a
   16-byte boundary.  All loads use aligned addresses (-12, 4, 20, ... from
   %ecx).  palignr $12 joins the newer vector (high half) with the older one
   (low half) and shifts right by 12 bytes, which yields the 16 bytes that
   start at %ecx.  %xmm1 and %xmm3 alternate as the older vector.  Every
   newly loaded vector is checked for a zero element before it is used.  */
427 | .p2align 4 |
428 | L(Shl12): |
429 | movaps -12(%ecx), %xmm1 |
430 | movaps 4(%ecx), %xmm2 |
/* Also entered from the 64-byte loop below, with %xmm1 and %xmm2 holding the
   same two vectors relative to the current %ecx.  */
431 | L(Shl12Start): |
432 | pcmpeqd %xmm2, %xmm0 |
433 | pmovmskb %xmm0, %eax |
434 | movaps %xmm2, %xmm3 |
435 | |
436 | test %eax, %eax |
437 | jnz L(Shl12LoopExit) |
438 | |
439 | palignr $12, %xmm1, %xmm2 |
440 | movaps %xmm2, (%edx) |
441 | movaps 20(%ecx), %xmm2 |
442 | |
443 | pcmpeqd %xmm2, %xmm0 |
444 | lea 16(%edx), %edx |
445 | pmovmskb %xmm0, %eax |
446 | lea 16(%ecx), %ecx |
447 | movaps %xmm2, %xmm1 |
448 | |
449 | test %eax, %eax |
450 | jnz L(Shl12LoopExit) |
451 | |
452 | palignr $12, %xmm3, %xmm2 |
453 | movaps %xmm2, (%edx) |
454 | movaps 20(%ecx), %xmm2 |
455 | |
456 | pcmpeqd %xmm2, %xmm0 |
457 | lea 16(%edx), %edx |
458 | pmovmskb %xmm0, %eax |
459 | lea 16(%ecx), %ecx |
460 | movaps %xmm2, %xmm3 |
461 | |
462 | test %eax, %eax |
463 | jnz L(Shl12LoopExit) |
464 | |
465 | palignr $12, %xmm1, %xmm2 |
466 | movaps %xmm2, (%edx) |
467 | movaps 20(%ecx), %xmm2 |
468 | |
469 | pcmpeqd %xmm2, %xmm0 |
470 | lea 16(%edx), %edx |
471 | pmovmskb %xmm0, %eax |
472 | lea 16(%ecx), %ecx |
473 | |
474 | test %eax, %eax |
475 | jnz L(Shl12LoopExit) |
476 | |
/* Four vectors done without finding a terminator.  Store the fourth, then
   move %ecx to the aligned address of the last loaded vector, round it down
   to a 64-byte boundary, and shift %edx by the same delta.  Subtracting 4
   restores the 12-byte skew so that 4(%ecx) is 64-byte aligned.  */
477 | palignr $12, %xmm3, %xmm2 |
478 | movaps %xmm2, (%edx) |
479 | lea 20(%ecx), %ecx |
480 | lea 16(%edx), %edx |
481 | |
482 | mov %ecx, %eax |
483 | and $-0x40, %ecx |
484 | sub %ecx, %eax |
485 | lea -4(%ecx), %ecx |
486 | sub %eax, %edx |
487 | |
/* %xmm1 = aligned vector that precedes the first vector of the loop.  */
488 | movaps -12(%ecx), %xmm1 |
489 | |
/* 64 bytes per iteration.  The byte-wise minimum of the four loaded vectors
   is compared dword-wise against zero.  %xmm7 keeps a copy of the last
   loaded vector because palignr overwrites %xmm5.  palignr does not modify
   flags, so the jnz tests the preceding test.  On a reported zero dword
   control returns to Shl12Start, which re-checks vector by vector.  */
490 | L(Shl12LoopStart): |
491 | movaps 4(%ecx), %xmm2 |
492 | movaps 20(%ecx), %xmm3 |
493 | movaps %xmm3, %xmm6 |
494 | movaps 36(%ecx), %xmm4 |
495 | movaps %xmm4, %xmm7 |
496 | movaps 52(%ecx), %xmm5 |
497 | pminub %xmm2, %xmm6 |
498 | pminub %xmm5, %xmm7 |
499 | pminub %xmm6, %xmm7 |
500 | pcmpeqd %xmm0, %xmm7 |
501 | pmovmskb %xmm7, %eax |
502 | movaps %xmm5, %xmm7 |
503 | palignr $12, %xmm4, %xmm5 |
504 | test %eax, %eax |
505 | palignr $12, %xmm3, %xmm4 |
506 | jnz L(Shl12Start) |
507 | |
/* No zero dword: finish the shifts, store 64 bytes, and carry the last
   loaded vector into the next iteration through %xmm1.  */
508 | palignr $12, %xmm2, %xmm3 |
509 | lea 64(%ecx), %ecx |
510 | palignr $12, %xmm1, %xmm2 |
511 | movaps %xmm7, %xmm1 |
512 | movaps %xmm5, 48(%edx) |
513 | movaps %xmm4, 32(%edx) |
514 | movaps %xmm3, 16(%edx) |
515 | movaps %xmm2, (%edx) |
516 | lea 64(%edx), %edx |
517 | jmp L(Shl12LoopStart) |
518 | |
/* A zero element was found in the vector that starts 4 bytes above %ecx.
   Copy the single element in front of it, then set %esi = 4 and fall through
   into the code that follows, which adds %esi to both cursors and finishes
   the copy according to the mask in %eax.  */
519 | L(Shl12LoopExit): |
520 | movl (%ecx), %esi |
521 | movl %esi, (%edx) |
522 | mov $4, %esi |
523 | |
/* Common tail.  On entry %esi is the byte offset from both cursors to the
   16-byte vector that holds the terminator, and %eax is the pmovmskb mask of
   the dword comparison for that vector (four mask bits per element).  After
   adjusting the cursors and restoring %esi, the mask selects how many bytes
   are copied: 4, 8, 12 or 16, terminator included.  Every exit returns the
   original destination from %edi and restores %edi through RETURN.  */
524 | .p2align 4 |
525 | L(CopyFrom1To16Bytes): |
526 | add %esi, %edx |
527 | add %esi, %ecx |
528 | |
529 | POP (%esi) |
/* Zero %al: no zero element among the first two, see ExitHigh.  Bit 0 set:
   the first element is the terminator.  Otherwise the second one is.  */
530 | test %al, %al |
531 | jz L(ExitHigh) |
532 | test $0x01, %al |
533 | jnz L(Exit4) |
/* Terminator is the second element: copy 8 bytes.  */
534 | L(Exit8): |
535 | movlpd (%ecx), %xmm0 |
536 | movlpd %xmm0, (%edx) |
537 | movl %edi, %eax |
538 | RETURN |
539 | |
/* Terminator is in the upper 8 bytes.  Bit 0 of %ah (mask bit 8) set means
   it is the third element, otherwise the fourth.  */
540 | .p2align 4 |
541 | L(ExitHigh): |
542 | test $0x01, %ah |
543 | jnz L(Exit12) |
/* Terminator is the fourth element: copy 16 bytes with unaligned moves.  */
544 | L(Exit16): |
545 | movdqu (%ecx), %xmm0 |
546 | movdqu %xmm0, (%edx) |
547 | movl %edi, %eax |
548 | RETURN |
549 | |
/* Terminator is the first element: copy 4 bytes.  */
550 | .p2align 4 |
551 | L(Exit4): |
552 | movl (%ecx), %eax |
553 | movl %eax, (%edx) |
554 | movl %edi, %eax |
555 | RETURN |
556 | |
/* Terminator is the third element: copy 8 + 4 bytes.  */
557 | .p2align 4 |
558 | L(Exit12): |
559 | movlpd (%ecx), %xmm0 |
560 | movlpd %xmm0, (%edx) |
561 | movl 8(%ecx), %eax |
562 | movl %eax, 8(%edx) |
563 | movl %edi, %eax |
564 | RETURN |
565 | |
/* Annotation only: cancels the CFI_PUSH (%edi) left by the last RETURN, as
   the code that follows is reached before %edi has been pushed.  */
566 | CFI_POP (%edi) |
567 | |
/* Short-string exits, reached from the four element tests at function entry
   before any register has been pushed.  %ecx is the source and %edx the
   untouched destination.  Each path copies the elements up to and including
   the terminator, returns the destination in %eax, and uses a plain ret.  */

/* First element is the terminator: copy 4 bytes.  */
568 | .p2align 4 |
569 | L(ExitTail4): |
570 | movl (%ecx), %eax |
571 | movl %eax, (%edx) |
572 | movl %edx, %eax |
573 | ret |
574 | |
/* Second element is the terminator: copy 8 bytes.  */
575 | .p2align 4 |
576 | L(ExitTail8): |
577 | movlpd (%ecx), %xmm0 |
578 | movlpd %xmm0, (%edx) |
579 | movl %edx, %eax |
580 | ret |
581 | |
/* Third element is the terminator: copy 8 + 4 bytes.  */
582 | .p2align 4 |
583 | L(ExitTail12): |
584 | movlpd (%ecx), %xmm0 |
585 | movlpd %xmm0, (%edx) |
586 | movl 8(%ecx), %eax |
587 | movl %eax, 8(%edx) |
588 | movl %edx, %eax |
589 | ret |
590 | |
/* Fourth element is the terminator: copy 16 bytes with unaligned moves.  */
591 | .p2align 4 |
592 | L(ExitTail16): |
593 | movdqu (%ecx), %xmm0 |
594 | movdqu %xmm0, (%edx) |
595 | movl %edx, %eax |
596 | ret |
597 | |
598 | END (__wcscpy_ssse3) |
599 | #endif |
600 | |