/* memcpy optimized with SSE2 unaligned memory access instructions.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#if IS_IN (libc) \
    && (defined SHARED \
        || defined USE_AS_MEMMOVE \
        || !defined USE_MULTIARCH)

# include <sysdep.h>
# include "asm-syntax.h"

# ifndef MEMCPY
#  define MEMCPY __memcpy_sse2_unaligned
#  define MEMCPY_CHK __memcpy_chk_sse2_unaligned
# endif

# define DEST PARMS
# define SRC DEST+4
# define LEN SRC+4

# define CFI_PUSH(REG) \
  cfi_adjust_cfa_offset (4); \
  cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
  cfi_adjust_cfa_offset (-4); \
  cfi_restore (REG)

# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

# define PARMS 8 /* Preserve EBX. */
# define ENTRANCE PUSH (%ebx);
# define RETURN_END POP (%ebx); ret
# define RETURN RETURN_END; CFI_PUSH (%ebx)

        .section .text.sse2,"ax",@progbits
# if defined SHARED
ENTRY (MEMCPY_CHK)
        movl 12(%esp), %eax
        cmpl %eax, 16(%esp)
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
# endif

ENTRY (MEMCPY)
        ENTRANCE
        movl LEN(%esp), %ecx
        movl SRC(%esp), %eax
        movl DEST(%esp), %edx
        cmp %edx, %eax
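/* EAX = src, EDX = dst, ECX = len. For memmove the comparison above
   selects the copy direction: if src > dst we branch to the forward
   checks, otherwise we fall through and copy backward so that
   overlapping source bytes are not overwritten before they are read.
   For plain memcpy the comparison result is ignored and control falls
   through to L(forward). */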

# ifdef USE_AS_MEMMOVE
        ja L(check_forward)

L(mm_len_0_or_more_backward):
/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately. */
        cmp $16, %ecx
        jbe L(mm_len_0_16_bytes_backward)

        cmpl $32, %ecx
        ja L(mm_len_32_or_more_backward)

/* Copy [0..32] and return. */
        movdqu (%eax), %xmm0
        movdqu -16(%eax, %ecx), %xmm1
        movdqu %xmm0, (%edx)
        movdqu %xmm1, -16(%edx, %ecx)
        jmp L(return)

L(mm_len_32_or_more_backward):
        cmpl $64, %ecx
        ja L(mm_len_64_or_more_backward)

/* Copy [0..64] and return. */
        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu -16(%eax, %ecx), %xmm2
        movdqu -32(%eax, %ecx), %xmm3
        movdqu %xmm0, (%edx)
        movdqu %xmm1, 16(%edx)
        movdqu %xmm2, -16(%edx, %ecx)
        movdqu %xmm3, -32(%edx, %ecx)
        jmp L(return)

L(mm_len_64_or_more_backward):
        cmpl $128, %ecx
        ja L(mm_len_128_or_more_backward)

/* Copy [0..128] and return. */
        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu 32(%eax), %xmm2
        movdqu 48(%eax), %xmm3
        movdqu -64(%eax, %ecx), %xmm4
        movdqu -48(%eax, %ecx), %xmm5
        movdqu -32(%eax, %ecx), %xmm6
        movdqu -16(%eax, %ecx), %xmm7
        movdqu %xmm0, (%edx)
        movdqu %xmm1, 16(%edx)
        movdqu %xmm2, 32(%edx)
        movdqu %xmm3, 48(%edx)
        movdqu %xmm4, -64(%edx, %ecx)
        movdqu %xmm5, -48(%edx, %ecx)
        movdqu %xmm6, -32(%edx, %ecx)
        movdqu %xmm7, -16(%edx, %ecx)
        jmp L(return)

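/* More than 128 bytes and dst >= src: if the regions do not overlap
   (src + len <= dst), fall back to the plain forward path; otherwise
   copy backward in 64-byte blocks. */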
L(mm_len_128_or_more_backward):
        add %ecx, %eax
        cmp %edx, %eax
        movl SRC(%esp), %eax
        jbe L(forward)
        PUSH (%esi)
        PUSH (%edi)
        PUSH (%ebx)

/* Align the destination address. */
        movdqu (%eax), %xmm4
        movdqu 16(%eax), %xmm5
        movdqu 32(%eax), %xmm6
        movdqu 48(%eax), %xmm7
        leal (%edx, %ecx), %esi
        movdqu -16(%eax, %ecx), %xmm0
        subl $16, %esp
        movdqu %xmm0, (%esp)
        mov %ecx, %edi
        movl %esi, %ecx
        andl $-16, %ecx
        leal (%ecx), %ebx
        subl %edx, %ebx
        leal (%eax, %ebx), %eax
        shrl $6, %ebx

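/* ESI = dst + len, EDI = len, ECX = dst + len rounded down to a 16-byte
   boundary, EBX = number of 64-byte blocks ending at that boundary and
   EAX = the matching source position. The first 64 bytes are saved in
   XMM4-XMM7 and the last 16 bytes on the stack; they are stored after
   the loop. Lengths of at least half the shared cache size use the
   non-temporal loop below to avoid cache pollution. */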
# ifdef SHARED_CACHE_SIZE_HALF
        cmp $SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
        PUSH (%ebx)
        SETUP_PIC_REG (bx)
        add $_GLOBAL_OFFSET_TABLE_, %ebx
        cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
        POP (%ebx)
#  else
        cmp __x86_shared_cache_size_half, %edi
#  endif
# endif
        jae L(mm_large_page_loop_backward)

        .p2align 4
L(mm_main_loop_backward):

        prefetcht0 -128(%eax)

        movdqu -64(%eax), %xmm0
        movdqu -48(%eax), %xmm1
        movdqu -32(%eax), %xmm2
        movdqu -16(%eax), %xmm3
        movaps %xmm0, -64(%ecx)
        subl $64, %eax
        movaps %xmm1, -48(%ecx)
        movaps %xmm2, -32(%ecx)
        movaps %xmm3, -16(%ecx)
        subl $64, %ecx
        sub $1, %ebx
        jnz L(mm_main_loop_backward)
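/* Loop done: store the saved last 16 bytes from the stack and the first
   64 bytes from XMM4-XMM7, which also cover the unaligned head. */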
        movdqu (%esp), %xmm0
        addl $16, %esp
        movdqu %xmm0, -16(%esi)
        movdqu %xmm4, (%edx)
        movdqu %xmm5, 16(%edx)
        movdqu %xmm6, 32(%edx)
        movdqu %xmm7, 48(%edx)
        POP (%ebx)
        jmp L(mm_return_pop_all)

/* Copy [0..16] and return. */
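/* Dispatch on the length bits: bit 3 or 4 set means 8..16 bytes, bit 2
   means 4..7 bytes, a zero length returns immediately, bit 1 means 2..3
   bytes, otherwise exactly one byte is copied. */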
L(mm_len_0_16_bytes_backward):
        testb $24, %cl
        jnz L(mm_len_9_16_bytes_backward)
        testb $4, %cl
        .p2align 4,,5
        jnz L(mm_len_5_8_bytes_backward)
        testl %ecx, %ecx
        .p2align 4,,2
        je L(return)
        testb $2, %cl
        .p2align 4,,1
        jne L(mm_len_3_4_bytes_backward)
        movzbl -1(%eax,%ecx), %ebx
        movzbl (%eax), %eax
        movb %bl, -1(%edx,%ecx)
        movb %al, (%edx)
        jmp L(return)

L(mm_len_3_4_bytes_backward):
        movzwl -2(%eax,%ecx), %ebx
        movzwl (%eax), %eax
        movw %bx, -2(%edx,%ecx)
        movw %ax, (%edx)
        jmp L(return)

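/* 8..16 bytes: copy the last eight bytes, shorten the length by eight
   and re-run the small-length dispatch for the remaining head. */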
L(mm_len_9_16_bytes_backward):
        PUSH (%esi)
        movl -4(%eax,%ecx), %ebx
        movl -8(%eax,%ecx), %esi
        movl %ebx, -4(%edx,%ecx)
        movl %esi, -8(%edx,%ecx)
        subl $8, %ecx
        POP (%esi)
        jmp L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
        movl (%eax), %ebx
        movl -4(%eax,%ecx), %eax
        movl %ebx, (%edx)
        movl %eax, -4(%edx,%ecx)
        jmp L(return)

/* Big length copy backward part. */
        .p2align 4
L(mm_large_page_loop_backward):
        movdqu -64(%eax), %xmm0
        movdqu -48(%eax), %xmm1
        movdqu -32(%eax), %xmm2
        movdqu -16(%eax), %xmm3
        movntdq %xmm0, -64(%ecx)
        subl $64, %eax
        movntdq %xmm1, -48(%ecx)
        movntdq %xmm2, -32(%ecx)
        movntdq %xmm3, -16(%ecx)
        subl $64, %ecx
        sub $1, %ebx
        jnz L(mm_large_page_loop_backward)
        sfence
        movdqu (%esp), %xmm0
        addl $16, %esp
        movdqu %xmm0, -16(%esi)
        movdqu %xmm4, (%edx)
        movdqu %xmm5, 16(%edx)
        movdqu %xmm6, 32(%edx)
        movdqu %xmm7, 48(%edx)
        POP (%ebx)
        jmp L(mm_return_pop_all)

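/* src > dst: if the regions do not overlap (dst + len <= src), use the
   plain forward memcpy path; otherwise use the forward memmove code
   below. */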
L(check_forward):
        add %edx, %ecx
        cmp %eax, %ecx
        movl LEN(%esp), %ecx
        jbe L(forward)

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately. */
        cmp $16, %ecx
        jbe L(mm_len_0_16_bytes_forward)

        cmpl $32, %ecx
        ja L(mm_len_32_or_more_forward)

/* Copy [0..32] and return. */
        movdqu (%eax), %xmm0
        movdqu -16(%eax, %ecx), %xmm1
        movdqu %xmm0, (%edx)
        movdqu %xmm1, -16(%edx, %ecx)
        jmp L(return)

L(mm_len_32_or_more_forward):
        cmpl $64, %ecx
        ja L(mm_len_64_or_more_forward)

/* Copy [0..64] and return. */
        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu -16(%eax, %ecx), %xmm2
        movdqu -32(%eax, %ecx), %xmm3
        movdqu %xmm0, (%edx)
        movdqu %xmm1, 16(%edx)
        movdqu %xmm2, -16(%edx, %ecx)
        movdqu %xmm3, -32(%edx, %ecx)
        jmp L(return)

L(mm_len_64_or_more_forward):
        cmpl $128, %ecx
        ja L(mm_len_128_or_more_forward)

/* Copy [0..128] and return. */
        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu 32(%eax), %xmm2
        movdqu 48(%eax), %xmm3
        movdqu -64(%eax, %ecx), %xmm4
        movdqu -48(%eax, %ecx), %xmm5
        movdqu -32(%eax, %ecx), %xmm6
        movdqu -16(%eax, %ecx), %xmm7
        movdqu %xmm0, (%edx)
        movdqu %xmm1, 16(%edx)
        movdqu %xmm2, 32(%edx)
        movdqu %xmm3, 48(%edx)
        movdqu %xmm4, -64(%edx, %ecx)
        movdqu %xmm5, -48(%edx, %ecx)
        movdqu %xmm6, -32(%edx, %ecx)
        movdqu %xmm7, -16(%edx, %ecx)
        jmp L(return)

L(mm_len_128_or_more_forward):
        PUSH (%esi)
        PUSH (%edi)
        PUSH (%ebx)

/* Align the destination address. */
        movdqu -16(%eax, %ecx), %xmm4
        movdqu -32(%eax, %ecx), %xmm5
        movdqu -48(%eax, %ecx), %xmm6
        movdqu -64(%eax, %ecx), %xmm7
        leal (%edx, %ecx), %esi
        movdqu (%eax), %xmm0
        subl $16, %esp
        movdqu %xmm0, (%esp)
        mov %ecx, %edi
        leal 16(%edx), %ecx
        andl $-16, %ecx
        movl %ecx, %ebx
        subl %edx, %ebx
        addl %ebx, %eax
        movl %esi, %ebx
        subl %ecx, %ebx
        shrl $6, %ebx

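/* ESI = dst + len, EDI = len, ECX = the first 16-byte boundary above
   dst, EAX = the matching source position and EBX = number of 64-byte
   blocks starting at that boundary. The first 16 bytes are saved on the
   stack and the last 64 bytes in XMM4-XMM7; they are stored after the
   loop. As in the backward case, lengths of at least half the shared
   cache size take the non-temporal loop. */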
# ifdef SHARED_CACHE_SIZE_HALF
        cmp $SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
        PUSH (%ebx)
        SETUP_PIC_REG(bx)
        add $_GLOBAL_OFFSET_TABLE_, %ebx
        cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
        POP (%ebx)
#  else
        cmp __x86_shared_cache_size_half, %edi
#  endif
# endif
        jae L(mm_large_page_loop_forward)

        .p2align 4
L(mm_main_loop_forward):

        prefetcht0 128(%eax)

        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu 32(%eax), %xmm2
        movdqu 48(%eax), %xmm3
        movdqa %xmm0, (%ecx)
        addl $64, %eax
        movaps %xmm1, 16(%ecx)
        movaps %xmm2, 32(%ecx)
        movaps %xmm3, 48(%ecx)
        addl $64, %ecx
        sub $1, %ebx
        jnz L(mm_main_loop_forward)
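/* Loop done: store the saved first 16 bytes from the stack and the last
   64 bytes from XMM4-XMM7, which also cover the unaligned tail. */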
        movdqu (%esp), %xmm0
        addl $16, %esp
        movdqu %xmm0, (%edx)
        movdqu %xmm4, -16(%esi)
        movdqu %xmm5, -32(%esi)
        movdqu %xmm6, -48(%esi)
        movdqu %xmm7, -64(%esi)
        POP (%ebx)
        jmp L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
        testb $24, %cl
        jne L(mm_len_9_16_bytes_forward)
        testb $4, %cl
        .p2align 4,,5
        jne L(mm_len_5_8_bytes_forward)
        testl %ecx, %ecx
        .p2align 4,,2
        je L(return)
        testb $2, %cl
        .p2align 4,,1
        jne L(mm_len_2_4_bytes_forward)
        movzbl -1(%eax,%ecx), %ebx
        movzbl (%eax), %eax
        movb %bl, -1(%edx,%ecx)
        movb %al, (%edx)
        jmp L(return)

L(mm_len_2_4_bytes_forward):
        movzwl -2(%eax,%ecx), %ebx
        movzwl (%eax), %eax
        movw %bx, -2(%edx,%ecx)
        movw %ax, (%edx)
        jmp L(return)

L(mm_len_5_8_bytes_forward):
        movl (%eax), %ebx
        movl -4(%eax,%ecx), %eax
        movl %ebx, (%edx)
        movl %eax, -4(%edx,%ecx)
        jmp L(return)

L(mm_len_9_16_bytes_forward):
        movq (%eax), %xmm0
        movq -8(%eax, %ecx), %xmm1
        movq %xmm0, (%edx)
        movq %xmm1, -8(%edx, %ecx)
        jmp L(return)

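/* Common exit for the big memmove paths: return the destination pointer
   and restore the callee-saved registers. */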
L(mm_return_pop_all):
        movl %edx, %eax
        POP (%edi)
        POP (%esi)
        RETURN

/* Big length copy forward part. */
        .p2align 4
L(mm_large_page_loop_forward):
        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu 32(%eax), %xmm2
        movdqu 48(%eax), %xmm3
        movntdq %xmm0, (%ecx)
        addl $64, %eax
        movntdq %xmm1, 16(%ecx)
        movntdq %xmm2, 32(%ecx)
        movntdq %xmm3, 48(%ecx)
        addl $64, %ecx
        sub $1, %ebx
        jnz L(mm_large_page_loop_forward)
        sfence
        movdqu (%esp), %xmm0
        addl $16, %esp
        movdqu %xmm0, (%edx)
        movdqu %xmm4, -16(%esi)
        movdqu %xmm5, -32(%esi)
        movdqu %xmm6, -48(%esi)
        movdqu %xmm7, -64(%esi)
        POP (%ebx)
        jmp L(mm_return_pop_all)
# endif

L(forward):
        cmp $16, %ecx
        jbe L(len_0_16_bytes)

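/* Forward memcpy: lengths of at least half the shared cache size take
   the non-temporal L(large_page) path; smaller lengths are copied with
   overlapping unaligned loads and stores and, above 128 bytes, with an
   aligned 64-byte loop. */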
# ifdef SHARED_CACHE_SIZE_HALF
        cmp $SHARED_CACHE_SIZE_HALF, %ecx
# else
#  ifdef PIC
        SETUP_PIC_REG(bx)
        add $_GLOBAL_OFFSET_TABLE_, %ebx
        cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
#  else
        cmp __x86_shared_cache_size_half, %ecx
#  endif
# endif
        jae L(large_page)

        movdqu (%eax), %xmm0
        movdqu -16(%eax, %ecx), %xmm1
        cmpl $32, %ecx
        movdqu %xmm0, (%edx)
        movdqu %xmm1, -16(%edx, %ecx)
        jbe L(return)

        movdqu 16(%eax), %xmm0
        movdqu -32(%eax, %ecx), %xmm1
        cmpl $64, %ecx
        movdqu %xmm0, 16(%edx)
        movdqu %xmm1, -32(%edx, %ecx)
        jbe L(return)

        movdqu 32(%eax), %xmm0
        movdqu 48(%eax), %xmm1
        movdqu -48(%eax, %ecx), %xmm2
        movdqu -64(%eax, %ecx), %xmm3
        cmpl $128, %ecx
        movdqu %xmm0, 32(%edx)
        movdqu %xmm1, 48(%edx)
        movdqu %xmm2, -48(%edx, %ecx)
        movdqu %xmm3, -64(%edx, %ecx)
        jbe L(return)

/* Now the main loop: we align the address of the destination. */
        leal 64(%edx), %ebx
        andl $-64, %ebx

        addl %edx, %ecx
        andl $-64, %ecx

        subl %edx, %eax
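/* EBX = the first 64-byte aligned address inside the destination, ECX =
   the 64-byte aligned end of the destination, and EAX = src - dst, so
   that (%ebx, %eax) addresses the matching source bytes. */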

/* We should stop two iterations before the termination
   (in order not to misprefetch). */
        subl $64, %ecx
        cmpl %ebx, %ecx
        je L(main_loop_just_one_iteration)

        subl $64, %ecx
        cmpl %ebx, %ecx
        je L(main_loop_last_two_iterations)
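/* After the two subtractions ECX is the aligned end minus 128, so the
   prefetching loop below stops two 64-byte blocks early; the final one
   or two blocks are copied without prefetching past the buffer. */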

        .p2align 4
L(main_loop_cache):

        prefetcht0 128(%ebx, %eax)

        movdqu (%ebx, %eax), %xmm0
        movdqu 16(%ebx, %eax), %xmm1
        movdqu 32(%ebx, %eax), %xmm2
        movdqu 48(%ebx, %eax), %xmm3
        movdqa %xmm0, (%ebx)
        movaps %xmm1, 16(%ebx)
        movaps %xmm2, 32(%ebx)
        movaps %xmm3, 48(%ebx)
        lea 64(%ebx), %ebx
        cmpl %ebx, %ecx
        jne L(main_loop_cache)

L(main_loop_last_two_iterations):
        movdqu (%ebx, %eax), %xmm0
        movdqu 16(%ebx, %eax), %xmm1
        movdqu 32(%ebx, %eax), %xmm2
        movdqu 48(%ebx, %eax), %xmm3
        movdqu 64(%ebx, %eax), %xmm4
        movdqu 80(%ebx, %eax), %xmm5
        movdqu 96(%ebx, %eax), %xmm6
        movdqu 112(%ebx, %eax), %xmm7
        movdqa %xmm0, (%ebx)
        movaps %xmm1, 16(%ebx)
        movaps %xmm2, 32(%ebx)
        movaps %xmm3, 48(%ebx)
        movaps %xmm4, 64(%ebx)
        movaps %xmm5, 80(%ebx)
        movaps %xmm6, 96(%ebx)
        movaps %xmm7, 112(%ebx)
        jmp L(return)

L(main_loop_just_one_iteration):
        movdqu (%ebx, %eax), %xmm0
        movdqu 16(%ebx, %eax), %xmm1
        movdqu 32(%ebx, %eax), %xmm2
        movdqu 48(%ebx, %eax), %xmm3
        movdqa %xmm0, (%ebx)
        movaps %xmm1, 16(%ebx)
        movaps %xmm2, 32(%ebx)
        movaps %xmm3, 48(%ebx)
        jmp L(return)

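/* Large copies: the first and last 128 bytes are copied with unaligned
   stores, then the 128-byte aligned middle is written with non-temporal
   stores, finished with an sfence. */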
L(large_page):
        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu 32(%eax), %xmm2
        movdqu 48(%eax), %xmm3
        movdqu -64(%eax, %ecx), %xmm4
        movdqu -48(%eax, %ecx), %xmm5
        movdqu -32(%eax, %ecx), %xmm6
        movdqu -16(%eax, %ecx), %xmm7
        movdqu %xmm0, (%edx)
        movdqu %xmm1, 16(%edx)
        movdqu %xmm2, 32(%edx)
        movdqu %xmm3, 48(%edx)
        movdqu %xmm4, -64(%edx, %ecx)
        movdqu %xmm5, -48(%edx, %ecx)
        movdqu %xmm6, -32(%edx, %ecx)
        movdqu %xmm7, -16(%edx, %ecx)

        movdqu 64(%eax), %xmm0
        movdqu 80(%eax), %xmm1
        movdqu 96(%eax), %xmm2
        movdqu 112(%eax), %xmm3
        movdqu -128(%eax, %ecx), %xmm4
        movdqu -112(%eax, %ecx), %xmm5
        movdqu -96(%eax, %ecx), %xmm6
        movdqu -80(%eax, %ecx), %xmm7
        movdqu %xmm0, 64(%edx)
        movdqu %xmm1, 80(%edx)
        movdqu %xmm2, 96(%edx)
        movdqu %xmm3, 112(%edx)
        movdqu %xmm4, -128(%edx, %ecx)
        movdqu %xmm5, -112(%edx, %ecx)
        movdqu %xmm6, -96(%edx, %ecx)
        movdqu %xmm7, -80(%edx, %ecx)

/* Now the main loop with non temporal stores. We align
   the address of the destination. */
        leal 128(%edx), %ebx
        andl $-128, %ebx

        addl %edx, %ecx
        andl $-128, %ecx

        subl %edx, %eax
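/* As above: EBX = the first 128-byte aligned destination address, ECX =
   the 128-byte aligned end and EAX = src - dst. */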

        .p2align 4
L(main_loop_large_page):
        movdqu (%ebx, %eax), %xmm0
        movdqu 16(%ebx, %eax), %xmm1
        movdqu 32(%ebx, %eax), %xmm2
        movdqu 48(%ebx, %eax), %xmm3
        movdqu 64(%ebx, %eax), %xmm4
        movdqu 80(%ebx, %eax), %xmm5
        movdqu 96(%ebx, %eax), %xmm6
        movdqu 112(%ebx, %eax), %xmm7
        movntdq %xmm0, (%ebx)
        movntdq %xmm1, 16(%ebx)
        movntdq %xmm2, 32(%ebx)
        movntdq %xmm3, 48(%ebx)
        movntdq %xmm4, 64(%ebx)
        movntdq %xmm5, 80(%ebx)
        movntdq %xmm6, 96(%ebx)
        movntdq %xmm7, 112(%ebx)
        lea 128(%ebx), %ebx
        cmpl %ebx, %ecx
        jne L(main_loop_large_page)
        sfence
        jmp L(return)

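/* Small memcpy (0..16 bytes): bit 3 or 4 of the length selects the
   8..16 byte copy and bit 2 the 4..7 byte copy; a zero length just
   returns, otherwise 1..3 bytes are copied with byte and word moves. */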
L(len_0_16_bytes):
        testb $24, %cl
        jne L(len_9_16_bytes)
        testb $4, %cl
        .p2align 4,,5
        jne L(len_5_8_bytes)
        testl %ecx, %ecx
        .p2align 4,,2
        je L(return)
        movzbl (%eax), %ebx
        testb $2, %cl
        movb %bl, (%edx)
        je L(return)
        movzwl -2(%eax,%ecx), %ebx
        movw %bx, -2(%edx,%ecx)
        jmp L(return)

L(len_9_16_bytes):
        movq (%eax), %xmm0
        movq -8(%eax, %ecx), %xmm1
        movq %xmm0, (%edx)
        movq %xmm1, -8(%edx, %ecx)
        jmp L(return)

L(len_5_8_bytes):
        movl (%eax), %ebx
        movl %ebx, (%edx)
        movl -4(%eax,%ecx), %ebx
        movl %ebx, -4(%edx,%ecx)

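/* All paths that did not save ESI/EDI end up here. memcpy and memmove
   return the destination pointer; a mempcpy build adds the length so
   that dst + len is returned. */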
L(return):
        movl %edx, %eax
# ifdef USE_AS_MEMPCPY
        movl LEN(%esp), %ecx
        add %ecx, %eax
# endif
        RETURN

END (MEMCPY)
#endif


source code of glibc/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S