1 | /* strcat with SSSE3 |
2 | Copyright (C) 2011-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | |
20 | #if IS_IN (libc) |
21 | |
22 | # include <sysdep.h> |
23 | |
/* CFI_PUSH (REG): unwind annotation matching a 4-byte push of REG.
   Grows the CFA offset by 4 and records REG as saved at offset 0 from
   the current stack pointer.  The cfi_* helpers are not defined in this
   file; presumably they come from <sysdep.h>.  */
24 | # define CFI_PUSH(REG) \ |
25 | cfi_adjust_cfa_offset (4); \ |
26 | cfi_rel_offset (REG, 0) |
27 | |
/* CFI_POP (REG): unwind annotation matching a 4-byte pop of REG.
   Shrinks the CFA offset by 4 and marks REG as holding its original
   value again.  */
28 | # define CFI_POP(REG) \ |
29 | cfi_adjust_cfa_offset (-4); \ |
30 | cfi_restore (REG) |
31 | |
/* PUSH / POP: the real pushl / popl instruction immediately followed by
   its matching annotation, so code and unwind info stay in step.  */
32 | # define PUSH(REG) pushl REG; CFI_PUSH (REG) |
33 | # define POP(REG) popl REG; CFI_POP (REG) |
34 | |
/* Symbol name of the routine.  A file that includes this one may
   predefine STRCAT to build the same code under another name.  */
35 | # ifndef STRCAT |
36 | # define STRCAT __strcat_ssse3 |
37 | # endif |
38 | |
/* Offsets of the stack arguments relative to %esp.
   STR1 and STR2 are read after exactly one 4-byte push (%edi), hence
   the extra +4 on top of PARMS.  PARMS presumably accounts for the
   4-byte return address -- confirm against the i386 calling convention.  */
39 | # define PARMS 4 |
40 | # define STR1 PARMS+4 |
41 | # define STR2 STR1+4 |
42 | |
/* LEN is only read in the USE_AS_STRNCAT build, after %ebx has been
   pushed as well; STR2+8 skips the third-argument step (+4) plus that
   second push (+4).  */
43 | # ifdef USE_AS_STRNCAT |
44 | # define LEN STR2+8 |
45 | # endif |
46 | |
/* Defined ahead of the #include lines further down.
   NOTE(review): presumably selects the strcat-specific paths inside
   strlen-sse2.S / strcpy-ssse3.S -- confirm in those files.  */
47 | # define USE_AS_STRCAT |
48 | |
/* STRCAT entry point.
   Saves %edi, loads the first stack argument (STR1) into %edi and
   mirrors it into %edx, then textually includes strlen-sse2.S with
   RETURN redefined as a jump to L(StartStrcpyPart).
   NOTE(review): presumably the included code scans the string at %edx
   and leaves its length in %eax (L(StartStrcpyPart) adds %eax to %edi)
   -- confirm in strlen-sse2.S.  */
49 | .text |
50 | ENTRY (STRCAT) |
51 | PUSH (%edi) /* popped again by RETURN1 on every exit path in this file */ |
52 | mov STR1(%esp), %edi /* %edi = first argument; returned in %eax at each exit */ |
53 | mov %edi, %edx |
54 | |
55 | # define RETURN jmp L(StartStrcpyPart) |
56 | # include "strlen-sse2.S" |
57 | |
/* L(StartStrcpyPart): start of the copy phase.
   In:  %edi = first argument, %eax = offset added to it (presumably the
        length found by the included strlen code -- confirm).
   Sets %ecx = second stack argument (source) and %edx = %edi + %eax
   (write position).
   Then inspects the first 16 source bytes one at a time: a zero byte at
   index k jumps to L(Exit<k+1>), which copies k+1 bytes.  If none of the
   16 bytes is zero, control continues into the code that follows this
   block.
   USE_AS_STRNCAT build only: %ebx is saved and loaded with LEN; LEN == 0
   leaves via L(StrncatExit0), LEN <= 8 via L(StrncatExit8Bytes), and
   LEN < 16 (checked once bytes 0..8 are known to be non-zero) via
   L(StrncatExit15Bytes).  */
58 | L(StartStrcpyPart): |
59 | mov STR2(%esp), %ecx /* read before %ebx is pushed, so STR2 is still valid */ |
60 | lea (%edi, %eax), %edx |
61 | # ifdef USE_AS_STRNCAT |
62 | PUSH (%ebx) |
63 | mov LEN(%esp), %ebx |
64 | test %ebx, %ebx |
65 | jz L(StrncatExit0) |
66 | cmp $8, %ebx |
67 | jbe L(StrncatExit8Bytes) |
68 | # endif |
69 | cmpb $0, (%ecx) |
70 | jz L(Exit1) |
71 | cmpb $0, 1(%ecx) |
72 | jz L(Exit2) |
73 | cmpb $0, 2(%ecx) |
74 | jz L(Exit3) |
75 | cmpb $0, 3(%ecx) |
76 | jz L(Exit4) |
77 | cmpb $0, 4(%ecx) |
78 | jz L(Exit5) |
79 | cmpb $0, 5(%ecx) |
80 | jz L(Exit6) |
81 | cmpb $0, 6(%ecx) |
82 | jz L(Exit7) |
83 | cmpb $0, 7(%ecx) |
84 | jz L(Exit8) |
85 | cmpb $0, 8(%ecx) |
86 | jz L(Exit9) |
87 | # ifdef USE_AS_STRNCAT |
88 | cmp $16, %ebx /* unsigned compare: 9 <= LEN < 16 takes the bounded path */ |
89 | jb L(StrncatExit15Bytes) |
90 | # endif |
91 | cmpb $0, 9(%ecx) |
92 | jz L(Exit10) |
93 | cmpb $0, 10(%ecx) |
94 | jz L(Exit11) |
95 | cmpb $0, 11(%ecx) |
96 | jz L(Exit12) |
97 | cmpb $0, 12(%ecx) |
98 | jz L(Exit13) |
99 | cmpb $0, 13(%ecx) |
100 | jz L(Exit14) |
101 | cmpb $0, 14(%ecx) |
102 | jz L(Exit15) |
103 | cmpb $0, 15(%ecx) |
104 | jz L(Exit16) |
/* USE_AS_STRNCAT build: none of the first 16 source bytes is zero here.
   If LEN is exactly 16, L(StrncatExit16) copies those 16 bytes and adds
   the terminator.  */
105 | # ifdef USE_AS_STRNCAT |
106 | cmp $16, %ebx |
107 | je L(StrncatExit16) |
108 | |
/* RETURN1: function epilogue used by every exit path.
   Pops the saved registers in reverse push order and returns.  The
   CFI_PUSH annotations after the `ret' emit no instructions; they
   re-establish the unwind state for the code that textually follows,
   which is reached by jumps with the registers still on the stack.
   They must replay the pushes in the order the code performed them:
   %edi first (at ENTRY), %ebx second (in L(StartStrcpyPart)).  Each
   cfi_rel_offset (REG, 0) binds REG to the slot at the then-current top
   of stack, so replaying them in the opposite order would describe the
   two save slots swapped.  */
109 | # define RETURN1 \ |
110 | POP (%ebx); \ |
111 | POP (%edi); \ |
112 | ret; \ |
113 | CFI_PUSH (%edi); \ |
114 | CFI_PUSH (%ebx) |
/* NOTE(review): USE_AS_STRNCPY presumably enables the length-bounded
   paths in the strcpy-ssse3.S included next; it also guards the
   Case2/Case3 code later in this file.  */
115 | # define USE_AS_STRNCPY |
116 | # else |
/* Plain strcat build: only %edi was pushed.  */
117 | # define RETURN1 POP (%edi); ret; CFI_PUSH (%edi) |
118 | # endif |
119 | # include "strcpy-ssse3.S" |
/* L(CopyFrom1To16Bytes): finish a copy whose final chunk is at most 16
   bytes.  Not jumped to from the code visible in this file;
   NOTE(review): presumably reached from the included strcpy-ssse3.S.
   In:  %esi = offset added to both pointers, then popped (the matching
        push is not in this file -- presumably in strcpy-ssse3.S);
        %ecx / %edx = source / destination bases;
        %ax = bit mask where bit k set selects L(Exit<k+1>)
        (presumably a zero-byte mask for the chunk -- confirm).
   Dispatches on the lowest set bit.  %al == 0 defers to L(ExitHigh).
   Falling through all seven tests means only bit 7 of %al is left, so
   8 bytes are copied inline.  */
120 | .p2align 4 |
121 | L(CopyFrom1To16Bytes): |
122 | add %esi, %edx |
123 | add %esi, %ecx |
124 | |
125 | POP (%esi) |
126 | test %al, %al |
127 | jz L(ExitHigh) |
128 | test $0x01, %al |
129 | jnz L(Exit1) |
130 | test $0x02, %al |
131 | jnz L(Exit2) |
132 | test $0x04, %al |
133 | jnz L(Exit3) |
134 | test $0x08, %al |
135 | jnz L(Exit4) |
136 | test $0x10, %al |
137 | jnz L(Exit5) |
138 | test $0x20, %al |
139 | jnz L(Exit6) |
140 | test $0x40, %al |
141 | jnz L(Exit7) |
142 | movlpd (%ecx), %xmm0 /* 8-byte copy, same as L(Exit8) */ |
143 | movlpd %xmm0, (%edx) |
144 | movl %edi, %eax /* return value = first argument */ |
145 | RETURN1 |
146 | |
/* L(ExitHigh): continuation of L(CopyFrom1To16Bytes) for %al == 0.
   Bit k of %ah (k = 0..6) selects L(Exit<9+k>).  If none of those bits
   is set the code falls through and copies a full 16 bytes inline as
   two 8-byte halves.  */
147 | .p2align 4 |
148 | L(ExitHigh): |
149 | test $0x01, %ah |
150 | jnz L(Exit9) |
151 | test $0x02, %ah |
152 | jnz L(Exit10) |
153 | test $0x04, %ah |
154 | jnz L(Exit11) |
155 | test $0x08, %ah |
156 | jnz L(Exit12) |
157 | test $0x10, %ah |
158 | jnz L(Exit13) |
159 | test $0x20, %ah |
160 | jnz L(Exit14) |
161 | test $0x40, %ah |
162 | jnz L(Exit15) |
163 | movlpd (%ecx), %xmm0 |
164 | movlpd 8(%ecx), %xmm1 |
165 | movlpd %xmm0, (%edx) |
166 | movlpd %xmm1, 8(%edx) |
167 | movl %edi, %eax /* return value = first argument */ |
168 | RETURN1 |
169 | |
/* Fixed-size tails for 1..8 bytes.
   L(Exit<N>) copies exactly N bytes from (%ecx) to (%edx), sets %eax to
   %edi (the first argument) and returns through RETURN1.
   L(StrncatExit<N>) first stores %bh at offset N of the destination and
   then falls through into L(Exit<N>).  At every jump site visible in
   this file it is reached right after `cmp $N, %ebx; je', so %bh is
   zero and the store writes the terminating NUL just past the N bytes
   copied next.
   Sizes that are not a single move width use two moves: 3 = 2+1,
   5 = 4+1, 6 = 4+2, and 7 = two overlapping 4-byte moves at offsets
   0 and 3.  */
170 | .p2align 4 |
171 | L(StrncatExit1): |
172 | movb %bh, 1(%edx) |
173 | L(Exit1): |
174 | movb (%ecx), %al |
175 | movb %al, (%edx) |
176 | movl %edi, %eax |
177 | RETURN1 |
178 | |
179 | .p2align 4 |
180 | L(StrncatExit2): |
181 | movb %bh, 2(%edx) |
182 | L(Exit2): |
183 | movw (%ecx), %ax |
184 | movw %ax, (%edx) |
185 | movl %edi, %eax |
186 | RETURN1 |
187 | |
188 | .p2align 4 |
189 | L(StrncatExit3): |
190 | movb %bh, 3(%edx) |
191 | L(Exit3): |
192 | movw (%ecx), %ax |
193 | movw %ax, (%edx) |
194 | movb 2(%ecx), %al |
195 | movb %al, 2(%edx) |
196 | movl %edi, %eax |
197 | RETURN1 |
198 | |
199 | .p2align 4 |
200 | L(StrncatExit4): |
201 | movb %bh, 4(%edx) |
202 | L(Exit4): |
203 | movl (%ecx), %eax |
204 | movl %eax, (%edx) |
205 | movl %edi, %eax |
206 | RETURN1 |
207 | |
208 | .p2align 4 |
209 | L(StrncatExit5): |
210 | movb %bh, 5(%edx) |
211 | L(Exit5): |
212 | movl (%ecx), %eax |
213 | movl %eax, (%edx) |
214 | movb 4(%ecx), %al |
215 | movb %al, 4(%edx) |
216 | movl %edi, %eax |
217 | RETURN1 |
218 | |
219 | .p2align 4 |
220 | L(StrncatExit6): |
221 | movb %bh, 6(%edx) |
222 | L(Exit6): |
223 | movl (%ecx), %eax |
224 | movl %eax, (%edx) |
225 | movw 4(%ecx), %ax |
226 | movw %ax, 4(%edx) |
227 | movl %edi, %eax |
228 | RETURN1 |
229 | |
230 | .p2align 4 |
231 | L(StrncatExit7): |
232 | movb %bh, 7(%edx) |
233 | L(Exit7): |
234 | movl (%ecx), %eax |
235 | movl %eax, (%edx) |
236 | movl 3(%ecx), %eax /* bytes 3..6; byte 3 is written twice */ |
237 | movl %eax, 3(%edx) |
238 | movl %edi, %eax |
239 | RETURN1 |
240 | |
241 | .p2align 4 |
242 | L(StrncatExit8): |
243 | movb %bh, 8(%edx) |
244 | L(Exit8): |
245 | movlpd (%ecx), %xmm0 |
246 | movlpd %xmm0, (%edx) |
247 | movl %edi, %eax |
248 | RETURN1 |
249 | |
/* Fixed-size tails for 9..16 bytes.
   L(Exit<N>) copies exactly N bytes from (%ecx) to (%edx), sets %eax to
   %edi (the first argument) and returns through RETURN1.
   L(StrncatExit<N>) first stores %bh at offset N of the destination and
   then falls through into L(Exit<N>).  At every jump site visible in
   this file it is reached right after `cmp $N, %ebx; je', so %bh is
   zero and the store writes the terminating NUL just past the N bytes
   copied next.
   Each tail starts with an 8-byte movlpd at offset 0.  The remainder is
   either an exact move (9 = 8+1, 10 = 8+2, 12 = 8+4, 16 = 8+8) or a
   second move that overlaps the first (11: 4 bytes at offset 7;
   13 / 14 / 15: 8 bytes at offset 5 / 6 / 7).  */
250 | .p2align 4 |
251 | L(StrncatExit9): |
252 | movb %bh, 9(%edx) |
253 | L(Exit9): |
254 | movlpd (%ecx), %xmm0 |
255 | movlpd %xmm0, (%edx) |
256 | movb 8(%ecx), %al |
257 | movb %al, 8(%edx) |
258 | movl %edi, %eax |
259 | RETURN1 |
260 | |
261 | .p2align 4 |
262 | L(StrncatExit10): |
263 | movb %bh, 10(%edx) |
264 | L(Exit10): |
265 | movlpd (%ecx), %xmm0 |
266 | movlpd %xmm0, (%edx) |
267 | movw 8(%ecx), %ax |
268 | movw %ax, 8(%edx) |
269 | movl %edi, %eax |
270 | RETURN1 |
271 | |
272 | .p2align 4 |
273 | L(StrncatExit11): |
274 | movb %bh, 11(%edx) |
275 | L(Exit11): |
276 | movlpd (%ecx), %xmm0 |
277 | movlpd %xmm0, (%edx) |
278 | movl 7(%ecx), %eax /* bytes 7..10; byte 7 is written twice */ |
279 | movl %eax, 7(%edx) |
280 | movl %edi, %eax |
281 | RETURN1 |
282 | |
283 | .p2align 4 |
284 | L(StrncatExit12): |
285 | movb %bh, 12(%edx) |
286 | L(Exit12): |
287 | movlpd (%ecx), %xmm0 |
288 | movlpd %xmm0, (%edx) |
289 | movl 8(%ecx), %eax |
290 | movl %eax, 8(%edx) |
291 | movl %edi, %eax |
292 | RETURN1 |
293 | |
294 | .p2align 4 |
295 | L(StrncatExit13): |
296 | movb %bh, 13(%edx) |
297 | L(Exit13): |
298 | movlpd (%ecx), %xmm0 |
299 | movlpd %xmm0, (%edx) |
300 | movlpd 5(%ecx), %xmm0 /* bytes 5..12 */ |
301 | movlpd %xmm0, 5(%edx) |
302 | movl %edi, %eax |
303 | RETURN1 |
304 | |
305 | .p2align 4 |
306 | L(StrncatExit14): |
307 | movb %bh, 14(%edx) |
308 | L(Exit14): |
309 | movlpd (%ecx), %xmm0 |
310 | movlpd %xmm0, (%edx) |
311 | movlpd 6(%ecx), %xmm0 /* bytes 6..13 */ |
312 | movlpd %xmm0, 6(%edx) |
313 | movl %edi, %eax |
314 | RETURN1 |
315 | |
316 | .p2align 4 |
317 | L(StrncatExit15): |
318 | movb %bh, 15(%edx) |
319 | L(Exit15): |
320 | movlpd (%ecx), %xmm0 |
321 | movlpd %xmm0, (%edx) |
322 | movlpd 7(%ecx), %xmm0 /* bytes 7..14 */ |
323 | movlpd %xmm0, 7(%edx) |
324 | movl %edi, %eax |
325 | RETURN1 |
326 | |
327 | .p2align 4 |
328 | L(StrncatExit16): |
329 | movb %bh, 16(%edx) |
330 | L(Exit16): |
331 | movlpd (%ecx), %xmm0 |
332 | movlpd 8(%ecx), %xmm1 |
333 | movlpd %xmm0, (%edx) |
334 | movlpd %xmm1, 8(%edx) |
335 | movl %edi, %eax |
336 | RETURN1 |
337 | |
338 | # ifdef USE_AS_STRNCPY |
339 | |
/* The code below is entered with %esi still on the stack (it is popped
   a few lines down), so the unwind state is told about that push first.
   The push itself is not in this file; presumably it is in
   strcpy-ssse3.S.  */
340 | CFI_PUSH(%esi) |
341 | |
/* L(CopyFrom1To16BytesCase2): final chunk where the mask in %eax is
   non-zero (see the test in L(CopyFrom1To16BytesCase2OrCase3)) and a
   byte budget in %ebx applies as well.
   In:  %ebx = budget minus 16 (16 is added back first);
        %esi = offset added to %ecx (source) and %edx (destination);
        %ax  = bit mask, bit k selecting L(Exit<k+1>).
   NOTE(review): assumes %ebx is in 1..16 after the add -- set by the
   caller in strcpy-ssse3.S, confirm.
   The low half (bytes 1..8) is handled here when %al is non-zero or
   bit 15 of (%ebx - 9) is set, i.e. %ebx < 9 under the assumption
   above; otherwise L(ExitHighCase2) takes over.  For each size N the
   mask bit is tested before the budget, so a zero byte at index N-1
   wins over a budget of N.  */
342 | .p2align 4 |
343 | L(CopyFrom1To16BytesCase2): |
344 | add $16, %ebx |
345 | add %esi, %ecx |
346 | lea (%esi, %edx), %esi /* park the destination in %esi; %edx is scratch next */ |
347 | lea -9(%ebx), %edx |
348 | and $1<<7, %dh /* keep only bit 15 of (%ebx - 9) */ |
349 | or %al, %dh |
350 | test %dh, %dh /* ZF = 1 iff %al == 0 and that bit is clear */ |
351 | lea (%esi), %edx /* destination back in %edx; lea and pop leave the flags alone */ |
352 | POP (%esi) |
353 | jz L(ExitHighCase2) |
354 | |
355 | test $0x01, %al |
356 | jnz L(Exit1) |
357 | cmp $1, %ebx |
358 | je L(StrncatExit1) |
359 | test $0x02, %al |
360 | jnz L(Exit2) |
361 | cmp $2, %ebx |
362 | je L(StrncatExit2) |
363 | test $0x04, %al |
364 | jnz L(Exit3) |
365 | cmp $3, %ebx |
366 | je L(StrncatExit3) |
367 | test $0x08, %al |
368 | jnz L(Exit4) |
369 | cmp $4, %ebx |
370 | je L(StrncatExit4) |
371 | test $0x10, %al |
372 | jnz L(Exit5) |
373 | cmp $5, %ebx |
374 | je L(StrncatExit5) |
375 | test $0x20, %al |
376 | jnz L(Exit6) |
377 | cmp $6, %ebx |
378 | je L(StrncatExit6) |
379 | test $0x40, %al |
380 | jnz L(Exit7) |
381 | cmp $7, %ebx |
382 | je L(StrncatExit7) |
/* Fallthrough: copy 8 bytes, then place the terminator at offset 7 if
   the byte just copied there is already zero, otherwise at offset 8.  */
383 | movlpd (%ecx), %xmm0 |
384 | movlpd %xmm0, (%edx) |
385 | lea 7(%edx), %eax |
386 | cmpb $1, (%eax) /* CF = 1 iff the byte at offset 7 is zero */ |
387 | sbb $-1, %eax /* %eax += 1 - CF */ |
388 | xor %cl, %cl /* zero byte to store; %ecx is not read again on this path */ |
389 | movb %cl, (%eax) |
390 | movl %edi, %eax /* return value = first argument */ |
391 | RETURN1 |
392 | |
/* L(ExitHighCase2): continuation of L(CopyFrom1To16BytesCase2) for
   sizes 9..16, entered when %al == 0 and bit 15 of (%ebx - 9) is clear.
   For N = 9..15, bit (N-9) of %ah selects L(Exit<N>) and is tested
   before the budget check `%ebx == N', which selects
   L(StrncatExit<N>).
   Fallthrough copies 16 bytes and stores no separate terminator.
   NOTE(review): that is only sufficient if the 16th byte copied is the
   zero byte, i.e. bit 7 of %ah is set whenever this path is reached --
   confirm against the caller in strcpy-ssse3.S.  */
393 | .p2align 4 |
394 | L(ExitHighCase2): |
395 | test $0x01, %ah |
396 | jnz L(Exit9) |
397 | cmp $9, %ebx |
398 | je L(StrncatExit9) |
399 | test $0x02, %ah |
400 | jnz L(Exit10) |
401 | cmp $10, %ebx |
402 | je L(StrncatExit10) |
403 | test $0x04, %ah |
404 | jnz L(Exit11) |
405 | cmp $11, %ebx |
406 | je L(StrncatExit11) |
407 | test $0x8, %ah |
408 | jnz L(Exit12) |
409 | cmp $12, %ebx |
410 | je L(StrncatExit12) |
411 | test $0x10, %ah |
412 | jnz L(Exit13) |
413 | cmp $13, %ebx |
414 | je L(StrncatExit13) |
415 | test $0x20, %ah |
416 | jnz L(Exit14) |
417 | cmp $14, %ebx |
418 | je L(StrncatExit14) |
419 | test $0x40, %ah |
420 | jnz L(Exit15) |
421 | cmp $15, %ebx |
422 | je L(StrncatExit15) |
423 | movlpd (%ecx), %xmm0 |
424 | movlpd %xmm0, (%edx) |
425 | movlpd 8(%ecx), %xmm1 |
426 | movlpd %xmm1, 8(%edx) |
427 | movl %edi, %eax /* return value = first argument */ |
428 | RETURN1 |
429 | |
/* As with Case2, the code below runs with %esi still pushed; the unwind
   state is re-established here because the preceding RETURN1 ended the
   previous description.  */
430 | CFI_PUSH(%esi) |
431 | |
/* L(CopyFrom1To16BytesCase2OrCase3): a non-zero mask in %eax goes to
   Case2; a zero mask falls through into Case3.  */
432 | L(CopyFrom1To16BytesCase2OrCase3): |
433 | test %eax, %eax |
434 | jnz L(CopyFrom1To16BytesCase2) |
435 | |
/* L(CopyFrom1To16BytesCase3): final chunk bounded only by the byte
   budget (the mask in %eax is zero when arriving by fallthrough).
   In:  %ebx = budget minus 16 (16 is added back first);
        %esi = offset added to %edx (destination) and %ecx (source),
               then popped.
   Budgets above 8 go to L(ExitHighCase3); 1..7 go to the matching
   L(StrncatExit<N>).  Any remaining value falls through, copies 8 bytes
   and stores %bh at offset 8 as the terminator.
   NOTE(review): %bh is zero only while %ebx < 256 -- presumably
   guaranteed by the caller, confirm.  */
436 | .p2align 4 |
437 | L(CopyFrom1To16BytesCase3): |
438 | add $16, %ebx |
439 | add %esi, %edx |
440 | add %esi, %ecx |
441 | |
442 | POP (%esi) |
443 | |
444 | cmp $8, %ebx |
445 | ja L(ExitHighCase3) |
446 | cmp $1, %ebx |
447 | je L(StrncatExit1) |
448 | cmp $2, %ebx |
449 | je L(StrncatExit2) |
450 | cmp $3, %ebx |
451 | je L(StrncatExit3) |
452 | cmp $4, %ebx |
453 | je L(StrncatExit4) |
454 | cmp $5, %ebx |
455 | je L(StrncatExit5) |
456 | cmp $6, %ebx |
457 | je L(StrncatExit6) |
458 | cmp $7, %ebx |
459 | je L(StrncatExit7) |
460 | movlpd (%ecx), %xmm0 |
461 | movlpd %xmm0, (%edx) |
462 | movb %bh, 8(%edx) |
463 | movl %edi, %eax /* return value = first argument */ |
464 | RETURN1 |
465 | |
/* L(ExitHighCase3): continuation of L(CopyFrom1To16BytesCase3) for
   budgets above 8.  %ebx == N for N = 9..15 selects
   L(StrncatExit<N>).  Any other value falls through, copies 16 bytes
   and stores %bh at offset 16 as the terminator.
   NOTE(review): %bh is zero only while %ebx < 256 -- presumably
   guaranteed by the caller, confirm.  */
466 | .p2align 4 |
467 | L(ExitHighCase3): |
468 | cmp $9, %ebx |
469 | je L(StrncatExit9) |
470 | cmp $10, %ebx |
471 | je L(StrncatExit10) |
472 | cmp $11, %ebx |
473 | je L(StrncatExit11) |
474 | cmp $12, %ebx |
475 | je L(StrncatExit12) |
476 | cmp $13, %ebx |
477 | je L(StrncatExit13) |
478 | cmp $14, %ebx |
479 | je L(StrncatExit14) |
480 | cmp $15, %ebx |
481 | je L(StrncatExit15) |
482 | movlpd (%ecx), %xmm0 |
483 | movlpd %xmm0, (%edx) |
484 | movlpd 8(%ecx), %xmm1 |
485 | movlpd %xmm1, 8(%edx) |
486 | movb %bh, 16(%edx) |
487 | movl %edi, %eax /* return value = first argument */ |
488 | RETURN1 |
489 | |
/* L(StrncatExit0): reached from L(StartStrcpyPart) when LEN is zero.
   Nothing is written; the first argument is returned unchanged.  */
490 | .p2align 4 |
491 | L(StrncatExit0): |
492 | movl %edi, %eax |
493 | RETURN1 |
494 | |
/* L(StrncatExit15Bytes): bounded path for 9 <= LEN <= 15.
   Reached from L(StartStrcpyPart) after source bytes 0..8 were found
   non-zero and %ebx (LEN) compared below 16; LEN <= 8 had already
   branched away earlier.
   For N = 9..14 in turn: LEN == N goes to L(StrncatExit<N>) (copy N
   bytes and terminate); otherwise a zero byte at index N goes to
   L(Exit<N+1>).
   Fallthrough therefore means LEN == 15 with bytes 0..13 non-zero.
   Fifteen bytes are copied as two overlapping 8-byte moves, and the
   terminator goes at offset 14 if that copied byte is already zero,
   otherwise at offset 15.  */
495 | .p2align 4 |
496 | L(StrncatExit15Bytes): |
497 | cmp $9, %ebx |
498 | je L(StrncatExit9) |
499 | cmpb $0, 9(%ecx) |
500 | jz L(Exit10) |
501 | cmp $10, %ebx |
502 | je L(StrncatExit10) |
503 | cmpb $0, 10(%ecx) |
504 | jz L(Exit11) |
505 | cmp $11, %ebx |
506 | je L(StrncatExit11) |
507 | cmpb $0, 11(%ecx) |
508 | jz L(Exit12) |
509 | cmp $12, %ebx |
510 | je L(StrncatExit12) |
511 | cmpb $0, 12(%ecx) |
512 | jz L(Exit13) |
513 | cmp $13, %ebx |
514 | je L(StrncatExit13) |
515 | cmpb $0, 13(%ecx) |
516 | jz L(Exit14) |
517 | cmp $14, %ebx |
518 | je L(StrncatExit14) |
519 | movlpd (%ecx), %xmm0 |
520 | movlpd %xmm0, (%edx) |
521 | movlpd 7(%ecx), %xmm0 /* bytes 7..14 */ |
522 | movlpd %xmm0, 7(%edx) |
523 | lea 14(%edx), %eax |
524 | cmpb $1, (%eax) /* CF = 1 iff the byte at offset 14 is zero */ |
525 | sbb $-1, %eax /* %eax += 1 - CF */ |
526 | movb %bh, (%eax) /* %bh == 0 because %ebx is 15 here */ |
527 | movl %edi, %eax /* return value = first argument */ |
528 | RETURN1 |
529 | |
/* L(StrncatExit8Bytes): bounded path for 1 <= LEN <= 8.
   Reached from L(StartStrcpyPart) when %ebx (LEN) is non-zero and at
   most 8.
   For N = 1..7 in turn: a zero byte at index N-1 goes to L(Exit<N>);
   otherwise LEN == N goes to L(StrncatExit<N>) (copy N bytes and
   terminate).
   Fallthrough therefore means LEN == 8 with bytes 0..6 non-zero.
   Eight bytes are copied, and the terminator goes at offset 7 if that
   copied byte is already zero, otherwise at offset 8.  */
530 | .p2align 4 |
531 | L(StrncatExit8Bytes): |
532 | cmpb $0, (%ecx) |
533 | jz L(Exit1) |
534 | cmp $1, %ebx |
535 | je L(StrncatExit1) |
536 | cmpb $0, 1(%ecx) |
537 | jz L(Exit2) |
538 | cmp $2, %ebx |
539 | je L(StrncatExit2) |
540 | cmpb $0, 2(%ecx) |
541 | jz L(Exit3) |
542 | cmp $3, %ebx |
543 | je L(StrncatExit3) |
544 | cmpb $0, 3(%ecx) |
545 | jz L(Exit4) |
546 | cmp $4, %ebx |
547 | je L(StrncatExit4) |
548 | cmpb $0, 4(%ecx) |
549 | jz L(Exit5) |
550 | cmp $5, %ebx |
551 | je L(StrncatExit5) |
552 | cmpb $0, 5(%ecx) |
553 | jz L(Exit6) |
554 | cmp $6, %ebx |
555 | je L(StrncatExit6) |
556 | cmpb $0, 6(%ecx) |
557 | jz L(Exit7) |
558 | cmp $7, %ebx |
559 | je L(StrncatExit7) |
560 | movlpd (%ecx), %xmm0 |
561 | movlpd %xmm0, (%edx) |
562 | lea 7(%edx), %eax |
563 | cmpb $1, (%eax) /* CF = 1 iff the byte at offset 7 is zero */ |
564 | sbb $-1, %eax /* %eax += 1 - CF */ |
565 | movb %bh, (%eax) /* %bh == 0 because %ebx is 8 here */ |
566 | movl %edi, %eax /* return value = first argument */ |
567 | RETURN1 |
568 | |
569 | # endif |
570 | END (STRCAT) |
571 | #endif |
572 | |