1/* memcpy with SSSE3
2 Copyright (C) 2010-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#if IS_IN (libc) \
20 && (defined SHARED \
21 || defined USE_AS_MEMMOVE \
22 || !defined USE_MULTIARCH)
23
24# include <sysdep.h>
25# include "asm-syntax.h"
26
27# ifndef MEMCPY
28# define MEMCPY __memcpy_ssse3
29# define MEMCPY_CHK __memcpy_chk_ssse3
30# endif
31
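/* Stack offsets of the dst, src and length arguments, relative to %esp
   once ENTRANCE has run (PARMS skips the return address, plus the saved
   %ebx in PIC builds).  */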
32# define DEST PARMS
33# define SRC DEST+4
34# define LEN SRC+4
35
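/* Unwind-info helpers for the manual pushl/popl of call-saved registers.  */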
36# define CFI_PUSH(REG) \
37 cfi_adjust_cfa_offset (4); \
38 cfi_rel_offset (REG, 0)
39
40# define CFI_POP(REG) \
41 cfi_adjust_cfa_offset (-4); \
42 cfi_restore (REG)
43
44# define PUSH(REG) pushl REG; CFI_PUSH (REG)
45# define POP(REG) popl REG; CFI_POP (REG)
46
47# ifdef PIC
48# define PARMS 8 /* Preserve EBX. */
49# define ENTRANCE PUSH (%ebx);
50# define RETURN_END POP (%ebx); ret
51# define RETURN RETURN_END; CFI_PUSH (%ebx)
52# define JMPTBL(I, B) I - B
53
/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register containing the
   index into the jump table.  SCALE is the scale of INDEX.  */
57
58# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
59 /* We first load PC into EBX. */ \
60 SETUP_PIC_REG(bx); \
61 /* Get the address of the jump table. */ \
62 addl $(TABLE - .), %ebx; \
63 /* Get the entry and convert the relative offset to the \
64 absolute address. */ \
65 addl (%ebx, INDEX, SCALE), %ebx; \
66 /* We loaded the jump table. Go. */ \
67 jmp *%ebx
68# else
69
70# define PARMS 4
71# define ENTRANCE
72# define RETURN_END ret
73# define RETURN RETURN_END
74# define JMPTBL(I, B) I
75
/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register containing the index into the
   jump table.  SCALE is the scale of INDEX.  */
79
80# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
81 jmp *TABLE(, INDEX, SCALE)
82# endif
83
84 .section .text.ssse3,"ax",@progbits
85# ifdef SHARED
86ENTRY (MEMCPY_CHK)
87 movl 12(%esp), %eax
88 cmpl %eax, 16(%esp)
89 jb HIDDEN_JUMPTARGET (__chk_fail)
90END (MEMCPY_CHK)
91# endif
92ENTRY (MEMCPY)
93 ENTRANCE
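	/* Load the arguments: %ecx = length, %eax = src, %edx = dst.  */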
94 movl LEN(%esp), %ecx
95 movl SRC(%esp), %eax
96 movl DEST(%esp), %edx
97
98# ifdef USE_AS_MEMMOVE
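	/* For memmove, pick the copy direction so that overlapping source
	   and destination regions are handled correctly.  */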
99 cmp %eax, %edx
100 jb L(copy_forward)
101 je L(fwd_write_0bytes)
102 cmp $32, %ecx
103 jae L(memmove_bwd)
104 jmp L(bk_write_less32bytes_2)
105
106 .p2align 4
107L(memmove_bwd):
108 add %ecx, %eax
109 cmp %eax, %edx
110 movl SRC(%esp), %eax
111 jb L(copy_backward)
112
113L(copy_forward):
114# endif
115 cmp $48, %ecx
116 jae L(48bytesormore)
117
118L(fwd_write_less32bytes):
119# ifndef USE_AS_MEMMOVE
120 cmp %dl, %al
121 jb L(bk_write)
122# endif
123 add %ecx, %edx
124 add %ecx, %eax
125 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
126# ifndef USE_AS_MEMMOVE
127 .p2align 4
128L(bk_write):
129 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
130# endif
131
132 .p2align 4
133L(48bytesormore):
134# ifndef USE_AS_MEMMOVE
135 movlpd (%eax), %xmm0
136 movlpd 8(%eax), %xmm1
137 movlpd %xmm0, (%edx)
138 movlpd %xmm1, 8(%edx)
139# else
140 movdqu (%eax), %xmm0
141# endif
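	/* The first 16 bytes are handled separately (copied above for memcpy;
	   kept in %xmm0 and stored later for memmove).  Round DST up to the
	   next 16-byte boundary and adjust SRC and the remaining length so
	   that the main-loop stores are 16-byte aligned.  */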
142 PUSH (%edi)
143 movl %edx, %edi
144 and $-16, %edx
145 add $16, %edx
146 sub %edx, %edi
147 add %edi, %ecx
148 sub %edi, %eax
149
150# ifdef SHARED_CACHE_SIZE_HALF
151 cmp $SHARED_CACHE_SIZE_HALF, %ecx
152# else
153# ifdef PIC
154 SETUP_PIC_REG(bx)
155 add $_GLOBAL_OFFSET_TABLE_, %ebx
156 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
157# else
158 cmp __x86_shared_cache_size_half, %ecx
159# endif
160# endif
161
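	/* Copies of at least half the shared cache go to the non-temporal
	   L(large_page) path; smaller ones dispatch on the source alignment
	   (SRC mod 16) through L(shl_table).  */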
162 mov %eax, %edi
163 jae L(large_page)
164 and $0xf, %edi
165 jz L(shl_0)
166 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
167
168 .p2align 4
169L(shl_0):
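	/* Both SRC and DST are now 16-byte aligned; copy with plain aligned
	   moves.  */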
170# ifdef USE_AS_MEMMOVE
171 movl DEST+4(%esp), %edi
172 movdqu %xmm0, (%edi)
173# endif
174 xor %edi, %edi
175 cmp $127, %ecx
176 ja L(shl_0_gobble)
177 lea -32(%ecx), %ecx
178
179 .p2align 4
180L(shl_0_loop):
181 movdqa (%eax, %edi), %xmm0
182 movdqa 16(%eax, %edi), %xmm1
183 sub $32, %ecx
184 movdqa %xmm0, (%edx, %edi)
185 movdqa %xmm1, 16(%edx, %edi)
186 lea 32(%edi), %edi
187 jb L(shl_0_end)
188
189 movdqa (%eax, %edi), %xmm0
190 movdqa 16(%eax, %edi), %xmm1
191 sub $32, %ecx
192 movdqa %xmm0, (%edx, %edi)
193 movdqa %xmm1, 16(%edx, %edi)
194 lea 32(%edi), %edi
195 jb L(shl_0_end)
196
197 movdqa (%eax, %edi), %xmm0
198 movdqa 16(%eax, %edi), %xmm1
199 sub $32, %ecx
200 movdqa %xmm0, (%edx, %edi)
201 movdqa %xmm1, 16(%edx, %edi)
202 lea 32(%edi), %edi
203 jb L(shl_0_end)
204
205 movdqa (%eax, %edi), %xmm0
206 movdqa 16(%eax, %edi), %xmm1
207 sub $32, %ecx
208 movdqa %xmm0, (%edx, %edi)
209 movdqa %xmm1, 16(%edx, %edi)
210 lea 32(%edi), %edi
211
212L(shl_0_end):
213 lea 32(%ecx), %ecx
214 add %ecx, %edi
215 add %edi, %edx
216 add %edi, %eax
217 POP (%edi)
218 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
219
220 CFI_PUSH (%edi)
221
222 .p2align 4
223L(shl_0_gobble):
224# ifdef DATA_CACHE_SIZE_HALF
225 cmp $DATA_CACHE_SIZE_HALF, %ecx
226# else
227# ifdef PIC
228 SETUP_PIC_REG(bx)
229 add $_GLOBAL_OFFSET_TABLE_, %ebx
230 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
231# else
232 cmp __x86_data_cache_size_half, %ecx
233# endif
234# endif
235 POP (%edi)
236 lea -128(%ecx), %ecx
237 jae L(shl_0_gobble_mem_loop)
238
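	/* Copy fits in half the data cache: 128 bytes per iteration without
	   software prefetch.  Larger copies use L(shl_0_gobble_mem_loop),
	   which adds prefetcht0 hints.  */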
239 .p2align 4
240L(shl_0_gobble_cache_loop):
241 movdqa (%eax), %xmm0
242 movdqa 0x10(%eax), %xmm1
243 movdqa 0x20(%eax), %xmm2
244 movdqa 0x30(%eax), %xmm3
245 movdqa 0x40(%eax), %xmm4
246 movdqa 0x50(%eax), %xmm5
247 movdqa 0x60(%eax), %xmm6
248 movdqa 0x70(%eax), %xmm7
249 lea 0x80(%eax), %eax
250 sub $128, %ecx
251 movdqa %xmm0, (%edx)
252 movdqa %xmm1, 0x10(%edx)
253 movdqa %xmm2, 0x20(%edx)
254 movdqa %xmm3, 0x30(%edx)
255 movdqa %xmm4, 0x40(%edx)
256 movdqa %xmm5, 0x50(%edx)
257 movdqa %xmm6, 0x60(%edx)
258 movdqa %xmm7, 0x70(%edx)
259 lea 0x80(%edx), %edx
260
261 jae L(shl_0_gobble_cache_loop)
262 cmp $-0x40, %ecx
263 lea 0x80(%ecx), %ecx
264 jl L(shl_0_cache_less_64bytes)
265
266 movdqa (%eax), %xmm0
267 sub $0x40, %ecx
268 movdqa 0x10(%eax), %xmm1
269 movdqa %xmm0, (%edx)
270 movdqa %xmm1, 0x10(%edx)
271 movdqa 0x20(%eax), %xmm0
272 movdqa 0x30(%eax), %xmm1
273 add $0x40, %eax
274 movdqa %xmm0, 0x20(%edx)
275 movdqa %xmm1, 0x30(%edx)
276 add $0x40, %edx
277
278L(shl_0_cache_less_64bytes):
279 cmp $0x20, %ecx
280 jb L(shl_0_cache_less_32bytes)
281 movdqa (%eax), %xmm0
282 sub $0x20, %ecx
283 movdqa 0x10(%eax), %xmm1
284 add $0x20, %eax
285 movdqa %xmm0, (%edx)
286 movdqa %xmm1, 0x10(%edx)
287 add $0x20, %edx
288
289L(shl_0_cache_less_32bytes):
290 cmp $0x10, %ecx
291 jb L(shl_0_cache_less_16bytes)
292 sub $0x10, %ecx
293 movdqa (%eax), %xmm0
294 add $0x10, %eax
295 movdqa %xmm0, (%edx)
296 add $0x10, %edx
297
298L(shl_0_cache_less_16bytes):
299 add %ecx, %edx
300 add %ecx, %eax
301 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
302
303 .p2align 4
304L(shl_0_gobble_mem_loop):
305 prefetcht0 0x1c0(%eax)
306 prefetcht0 0x280(%eax)
307 prefetcht0 0x1c0(%edx)
308
309 movdqa (%eax), %xmm0
310 movdqa 0x10(%eax), %xmm1
311 movdqa 0x20(%eax), %xmm2
312 movdqa 0x30(%eax), %xmm3
313 movdqa 0x40(%eax), %xmm4
314 movdqa 0x50(%eax), %xmm5
315 movdqa 0x60(%eax), %xmm6
316 movdqa 0x70(%eax), %xmm7
317 lea 0x80(%eax), %eax
318 sub $0x80, %ecx
319 movdqa %xmm0, (%edx)
320 movdqa %xmm1, 0x10(%edx)
321 movdqa %xmm2, 0x20(%edx)
322 movdqa %xmm3, 0x30(%edx)
323 movdqa %xmm4, 0x40(%edx)
324 movdqa %xmm5, 0x50(%edx)
325 movdqa %xmm6, 0x60(%edx)
326 movdqa %xmm7, 0x70(%edx)
327 lea 0x80(%edx), %edx
328
329 jae L(shl_0_gobble_mem_loop)
330 cmp $-0x40, %ecx
331 lea 0x80(%ecx), %ecx
332 jl L(shl_0_mem_less_64bytes)
333
334 movdqa (%eax), %xmm0
335 sub $0x40, %ecx
336 movdqa 0x10(%eax), %xmm1
337
338 movdqa %xmm0, (%edx)
339 movdqa %xmm1, 0x10(%edx)
340
341 movdqa 0x20(%eax), %xmm0
342 movdqa 0x30(%eax), %xmm1
343 add $0x40, %eax
344
345 movdqa %xmm0, 0x20(%edx)
346 movdqa %xmm1, 0x30(%edx)
347 add $0x40, %edx
348
349L(shl_0_mem_less_64bytes):
350 cmp $0x20, %ecx
351 jb L(shl_0_mem_less_32bytes)
352 movdqa (%eax), %xmm0
353 sub $0x20, %ecx
354 movdqa 0x10(%eax), %xmm1
355 add $0x20, %eax
356 movdqa %xmm0, (%edx)
357 movdqa %xmm1, 0x10(%edx)
358 add $0x20, %edx
359
360L(shl_0_mem_less_32bytes):
361 cmp $0x10, %ecx
362 jb L(shl_0_mem_less_16bytes)
363 sub $0x10, %ecx
364 movdqa (%eax), %xmm0
365 add $0x10, %eax
366 movdqa %xmm0, (%edx)
367 add $0x10, %edx
368
369L(shl_0_mem_less_16bytes):
370 add %ecx, %edx
371 add %ecx, %eax
372 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
373
374 .p2align 4
375L(shl_1):
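	/* SRC is 1 byte past a 16-byte boundary.  Load the aligned 16-byte
	   block containing the first source byte into %xmm1, then stitch
	   consecutive aligned blocks together with palignr $1.  L(shl_2)
	   through L(shl_15) below repeat the pattern for the other offsets.  */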
376# ifndef USE_AS_MEMMOVE
377 movaps -1(%eax), %xmm1
378# else
379 movl DEST+4(%esp), %edi
380 movaps -1(%eax), %xmm1
381 movdqu %xmm0, (%edi)
382# endif
383# ifdef DATA_CACHE_SIZE_HALF
384 cmp $DATA_CACHE_SIZE_HALF, %ecx
385# else
386# ifdef PIC
387 SETUP_PIC_REG(bx)
388 add $_GLOBAL_OFFSET_TABLE_, %ebx
389 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
390# else
391 cmp __x86_data_cache_size_half, %ecx
392# endif
393# endif
394 jb L(sh_1_no_prefetch)
395
396 lea -64(%ecx), %ecx
397
398 .p2align 4
399L(Shl1LoopStart):
400 prefetcht0 0x1c0(%eax)
401 prefetcht0 0x1c0(%edx)
402 movaps 15(%eax), %xmm2
403 movaps 31(%eax), %xmm3
404 movaps 47(%eax), %xmm4
405 movaps 63(%eax), %xmm5
406 movaps %xmm5, %xmm7
407 palignr $1, %xmm4, %xmm5
408 palignr $1, %xmm3, %xmm4
409 movaps %xmm5, 48(%edx)
410 palignr $1, %xmm2, %xmm3
411 lea 64(%eax), %eax
412 palignr $1, %xmm1, %xmm2
413 movaps %xmm4, 32(%edx)
414 movaps %xmm3, 16(%edx)
415 movaps %xmm7, %xmm1
416 movaps %xmm2, (%edx)
417 lea 64(%edx), %edx
418 sub $64, %ecx
419 ja L(Shl1LoopStart)
420
421L(Shl1LoopLeave):
422 add $32, %ecx
423 jle L(shl_end_0)
424
425 movaps 15(%eax), %xmm2
426 movaps 31(%eax), %xmm3
427 palignr $1, %xmm2, %xmm3
428 palignr $1, %xmm1, %xmm2
429 movaps %xmm2, (%edx)
430 movaps %xmm3, 16(%edx)
431 lea 32(%edx, %ecx), %edx
432 lea 32(%eax, %ecx), %eax
433 POP (%edi)
434 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
435
436 CFI_PUSH (%edi)
437
438 .p2align 4
439L(sh_1_no_prefetch):
440 lea -32(%ecx), %ecx
441 lea -1(%eax), %eax
442 xor %edi, %edi
443
444 .p2align 4
445L(sh_1_no_prefetch_loop):
446 movdqa 16(%eax, %edi), %xmm2
447 sub $32, %ecx
448 movdqa 32(%eax, %edi), %xmm3
449 movdqa %xmm3, %xmm4
450 palignr $1, %xmm2, %xmm3
451 palignr $1, %xmm1, %xmm2
452 lea 32(%edi), %edi
453 movdqa %xmm2, -32(%edx, %edi)
454 movdqa %xmm3, -16(%edx, %edi)
455 jb L(sh_1_end_no_prefetch_loop)
456
457 movdqa 16(%eax, %edi), %xmm2
458 sub $32, %ecx
459 movdqa 32(%eax, %edi), %xmm3
460 movdqa %xmm3, %xmm1
461 palignr $1, %xmm2, %xmm3
462 palignr $1, %xmm4, %xmm2
463 lea 32(%edi), %edi
464 movdqa %xmm2, -32(%edx, %edi)
465 movdqa %xmm3, -16(%edx, %edi)
466 jae L(sh_1_no_prefetch_loop)
467
468L(sh_1_end_no_prefetch_loop):
469 lea 32(%ecx), %ecx
470 add %ecx, %edi
471 add %edi, %edx
472 lea 1(%edi, %eax), %eax
473 POP (%edi)
474 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
475
476 CFI_PUSH (%edi)
477
478 .p2align 4
479L(shl_2):
480# ifndef USE_AS_MEMMOVE
481 movaps -2(%eax), %xmm1
482# else
483 movl DEST+4(%esp), %edi
484 movaps -2(%eax), %xmm1
485 movdqu %xmm0, (%edi)
486# endif
487# ifdef DATA_CACHE_SIZE_HALF
488 cmp $DATA_CACHE_SIZE_HALF, %ecx
489# else
490# ifdef PIC
491 SETUP_PIC_REG(bx)
492 add $_GLOBAL_OFFSET_TABLE_, %ebx
493 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
494# else
495 cmp __x86_data_cache_size_half, %ecx
496# endif
497# endif
498 jb L(sh_2_no_prefetch)
499
500 lea -64(%ecx), %ecx
501
502 .p2align 4
503L(Shl2LoopStart):
504 prefetcht0 0x1c0(%eax)
505 prefetcht0 0x1c0(%edx)
506 movaps 14(%eax), %xmm2
507 movaps 30(%eax), %xmm3
508 movaps 46(%eax), %xmm4
509 movaps 62(%eax), %xmm5
510 movaps %xmm5, %xmm7
511 palignr $2, %xmm4, %xmm5
512 palignr $2, %xmm3, %xmm4
513 movaps %xmm5, 48(%edx)
514 palignr $2, %xmm2, %xmm3
515 lea 64(%eax), %eax
516 palignr $2, %xmm1, %xmm2
517 movaps %xmm4, 32(%edx)
518 movaps %xmm3, 16(%edx)
519 movaps %xmm7, %xmm1
520 movaps %xmm2, (%edx)
521 lea 64(%edx), %edx
522 sub $64, %ecx
523 ja L(Shl2LoopStart)
524
525L(Shl2LoopLeave):
526 add $32, %ecx
527 jle L(shl_end_0)
528
529 movaps 14(%eax), %xmm2
530 movaps 30(%eax), %xmm3
531 palignr $2, %xmm2, %xmm3
532 palignr $2, %xmm1, %xmm2
533 movaps %xmm2, (%edx)
534 movaps %xmm3, 16(%edx)
535 lea 32(%edx, %ecx), %edx
536 lea 32(%eax, %ecx), %eax
537 POP (%edi)
538 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
539
540 CFI_PUSH (%edi)
541
542 .p2align 4
543L(sh_2_no_prefetch):
544 lea -32(%ecx), %ecx
545 lea -2(%eax), %eax
546 xor %edi, %edi
547
548 .p2align 4
549L(sh_2_no_prefetch_loop):
550 movdqa 16(%eax, %edi), %xmm2
551 sub $32, %ecx
552 movdqa 32(%eax, %edi), %xmm3
553 movdqa %xmm3, %xmm4
554 palignr $2, %xmm2, %xmm3
555 palignr $2, %xmm1, %xmm2
556 lea 32(%edi), %edi
557 movdqa %xmm2, -32(%edx, %edi)
558 movdqa %xmm3, -16(%edx, %edi)
559 jb L(sh_2_end_no_prefetch_loop)
560
561 movdqa 16(%eax, %edi), %xmm2
562 sub $32, %ecx
563 movdqa 32(%eax, %edi), %xmm3
564 movdqa %xmm3, %xmm1
565 palignr $2, %xmm2, %xmm3
566 palignr $2, %xmm4, %xmm2
567 lea 32(%edi), %edi
568 movdqa %xmm2, -32(%edx, %edi)
569 movdqa %xmm3, -16(%edx, %edi)
570 jae L(sh_2_no_prefetch_loop)
571
572L(sh_2_end_no_prefetch_loop):
573 lea 32(%ecx), %ecx
574 add %ecx, %edi
575 add %edi, %edx
576 lea 2(%edi, %eax), %eax
577 POP (%edi)
578 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
579
580 CFI_PUSH (%edi)
581
582 .p2align 4
583L(shl_3):
584# ifndef USE_AS_MEMMOVE
585 movaps -3(%eax), %xmm1
586# else
587 movl DEST+4(%esp), %edi
588 movaps -3(%eax), %xmm1
589 movdqu %xmm0, (%edi)
590# endif
591# ifdef DATA_CACHE_SIZE_HALF
592 cmp $DATA_CACHE_SIZE_HALF, %ecx
593# else
594# ifdef PIC
595 SETUP_PIC_REG(bx)
596 add $_GLOBAL_OFFSET_TABLE_, %ebx
597 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
598# else
599 cmp __x86_data_cache_size_half, %ecx
600# endif
601# endif
602 jb L(sh_3_no_prefetch)
603
604 lea -64(%ecx), %ecx
605
606 .p2align 4
607L(Shl3LoopStart):
608 prefetcht0 0x1c0(%eax)
609 prefetcht0 0x1c0(%edx)
610 movaps 13(%eax), %xmm2
611 movaps 29(%eax), %xmm3
612 movaps 45(%eax), %xmm4
613 movaps 61(%eax), %xmm5
614 movaps %xmm5, %xmm7
615 palignr $3, %xmm4, %xmm5
616 palignr $3, %xmm3, %xmm4
617 movaps %xmm5, 48(%edx)
618 palignr $3, %xmm2, %xmm3
619 lea 64(%eax), %eax
620 palignr $3, %xmm1, %xmm2
621 movaps %xmm4, 32(%edx)
622 movaps %xmm3, 16(%edx)
623 movaps %xmm7, %xmm1
624 movaps %xmm2, (%edx)
625 lea 64(%edx), %edx
626 sub $64, %ecx
627 ja L(Shl3LoopStart)
628
629L(Shl3LoopLeave):
630 add $32, %ecx
631 jle L(shl_end_0)
632
633 movaps 13(%eax), %xmm2
634 movaps 29(%eax), %xmm3
635 palignr $3, %xmm2, %xmm3
636 palignr $3, %xmm1, %xmm2
637 movaps %xmm2, (%edx)
638 movaps %xmm3, 16(%edx)
639 lea 32(%edx, %ecx), %edx
640 lea 32(%eax, %ecx), %eax
641 POP (%edi)
642 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
643
644 CFI_PUSH (%edi)
645
646 .p2align 4
647L(sh_3_no_prefetch):
648 lea -32(%ecx), %ecx
649 lea -3(%eax), %eax
650 xor %edi, %edi
651
652 .p2align 4
653L(sh_3_no_prefetch_loop):
654 movdqa 16(%eax, %edi), %xmm2
655 sub $32, %ecx
656 movdqa 32(%eax, %edi), %xmm3
657 movdqa %xmm3, %xmm4
658 palignr $3, %xmm2, %xmm3
659 palignr $3, %xmm1, %xmm2
660 lea 32(%edi), %edi
661 movdqa %xmm2, -32(%edx, %edi)
662 movdqa %xmm3, -16(%edx, %edi)
663
664 jb L(sh_3_end_no_prefetch_loop)
665
666 movdqa 16(%eax, %edi), %xmm2
667 sub $32, %ecx
668 movdqa 32(%eax, %edi), %xmm3
669 movdqa %xmm3, %xmm1
670 palignr $3, %xmm2, %xmm3
671 palignr $3, %xmm4, %xmm2
672 lea 32(%edi), %edi
673 movdqa %xmm2, -32(%edx, %edi)
674 movdqa %xmm3, -16(%edx, %edi)
675
676 jae L(sh_3_no_prefetch_loop)
677
678L(sh_3_end_no_prefetch_loop):
679 lea 32(%ecx), %ecx
680 add %ecx, %edi
681 add %edi, %edx
682 lea 3(%edi, %eax), %eax
683 POP (%edi)
684 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
685
686 CFI_PUSH (%edi)
687
688 .p2align 4
689L(shl_4):
690# ifndef USE_AS_MEMMOVE
691 movaps -4(%eax), %xmm1
692# else
693 movl DEST+4(%esp), %edi
694 movaps -4(%eax), %xmm1
695 movdqu %xmm0, (%edi)
696# endif
697# ifdef DATA_CACHE_SIZE_HALF
698 cmp $DATA_CACHE_SIZE_HALF, %ecx
699# else
700# ifdef PIC
701 SETUP_PIC_REG(bx)
702 add $_GLOBAL_OFFSET_TABLE_, %ebx
703 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
704# else
705 cmp __x86_data_cache_size_half, %ecx
706# endif
707# endif
708 jb L(sh_4_no_prefetch)
709
710 lea -64(%ecx), %ecx
711
712 .p2align 4
713L(Shl4LoopStart):
714 prefetcht0 0x1c0(%eax)
715 prefetcht0 0x1c0(%edx)
716 movaps 12(%eax), %xmm2
717 movaps 28(%eax), %xmm3
718 movaps 44(%eax), %xmm4
719 movaps 60(%eax), %xmm5
720 movaps %xmm5, %xmm7
721 palignr $4, %xmm4, %xmm5
722 palignr $4, %xmm3, %xmm4
723 movaps %xmm5, 48(%edx)
724 palignr $4, %xmm2, %xmm3
725 lea 64(%eax), %eax
726 palignr $4, %xmm1, %xmm2
727 movaps %xmm4, 32(%edx)
728 movaps %xmm3, 16(%edx)
729 movaps %xmm7, %xmm1
730 movaps %xmm2, (%edx)
731 lea 64(%edx), %edx
732 sub $64, %ecx
733 ja L(Shl4LoopStart)
734
735L(Shl4LoopLeave):
736 add $32, %ecx
737 jle L(shl_end_0)
738
739 movaps 12(%eax), %xmm2
740 movaps 28(%eax), %xmm3
741 palignr $4, %xmm2, %xmm3
742 palignr $4, %xmm1, %xmm2
743 movaps %xmm2, (%edx)
744 movaps %xmm3, 16(%edx)
745 lea 32(%edx, %ecx), %edx
746 lea 32(%eax, %ecx), %eax
747 POP (%edi)
748 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
749
750 CFI_PUSH (%edi)
751
752 .p2align 4
753L(sh_4_no_prefetch):
754 lea -32(%ecx), %ecx
755 lea -4(%eax), %eax
756 xor %edi, %edi
757
758 .p2align 4
759L(sh_4_no_prefetch_loop):
760 movdqa 16(%eax, %edi), %xmm2
761 sub $32, %ecx
762 movdqa 32(%eax, %edi), %xmm3
763 movdqa %xmm3, %xmm4
764 palignr $4, %xmm2, %xmm3
765 palignr $4, %xmm1, %xmm2
766 lea 32(%edi), %edi
767 movdqa %xmm2, -32(%edx, %edi)
768 movdqa %xmm3, -16(%edx, %edi)
769
770 jb L(sh_4_end_no_prefetch_loop)
771
772 movdqa 16(%eax, %edi), %xmm2
773 sub $32, %ecx
774 movdqa 32(%eax, %edi), %xmm3
775 movdqa %xmm3, %xmm1
776 palignr $4, %xmm2, %xmm3
777 palignr $4, %xmm4, %xmm2
778 lea 32(%edi), %edi
779 movdqa %xmm2, -32(%edx, %edi)
780 movdqa %xmm3, -16(%edx, %edi)
781
782 jae L(sh_4_no_prefetch_loop)
783
784L(sh_4_end_no_prefetch_loop):
785 lea 32(%ecx), %ecx
786 add %ecx, %edi
787 add %edi, %edx
788 lea 4(%edi, %eax), %eax
789 POP (%edi)
790 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
791
792 CFI_PUSH (%edi)
793
794 .p2align 4
795L(shl_5):
796# ifndef USE_AS_MEMMOVE
797 movaps -5(%eax), %xmm1
798# else
799 movl DEST+4(%esp), %edi
800 movaps -5(%eax), %xmm1
801 movdqu %xmm0, (%edi)
802# endif
803# ifdef DATA_CACHE_SIZE_HALF
804 cmp $DATA_CACHE_SIZE_HALF, %ecx
805# else
806# ifdef PIC
807 SETUP_PIC_REG(bx)
808 add $_GLOBAL_OFFSET_TABLE_, %ebx
809 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
810# else
811 cmp __x86_data_cache_size_half, %ecx
812# endif
813# endif
814 jb L(sh_5_no_prefetch)
815
816 lea -64(%ecx), %ecx
817
818 .p2align 4
819L(Shl5LoopStart):
820 prefetcht0 0x1c0(%eax)
821 prefetcht0 0x1c0(%edx)
822 movaps 11(%eax), %xmm2
823 movaps 27(%eax), %xmm3
824 movaps 43(%eax), %xmm4
825 movaps 59(%eax), %xmm5
826 movaps %xmm5, %xmm7
827 palignr $5, %xmm4, %xmm5
828 palignr $5, %xmm3, %xmm4
829 movaps %xmm5, 48(%edx)
830 palignr $5, %xmm2, %xmm3
831 lea 64(%eax), %eax
832 palignr $5, %xmm1, %xmm2
833 movaps %xmm4, 32(%edx)
834 movaps %xmm3, 16(%edx)
835 movaps %xmm7, %xmm1
836 movaps %xmm2, (%edx)
837 lea 64(%edx), %edx
838 sub $64, %ecx
839 ja L(Shl5LoopStart)
840
841L(Shl5LoopLeave):
842 add $32, %ecx
843 jle L(shl_end_0)
844
845 movaps 11(%eax), %xmm2
846 movaps 27(%eax), %xmm3
847 palignr $5, %xmm2, %xmm3
848 palignr $5, %xmm1, %xmm2
849 movaps %xmm2, (%edx)
850 movaps %xmm3, 16(%edx)
851 lea 32(%edx, %ecx), %edx
852 lea 32(%eax, %ecx), %eax
853 POP (%edi)
854 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
855
856 CFI_PUSH (%edi)
857
858 .p2align 4
859L(sh_5_no_prefetch):
860 lea -32(%ecx), %ecx
861 lea -5(%eax), %eax
862 xor %edi, %edi
863
864 .p2align 4
865L(sh_5_no_prefetch_loop):
866 movdqa 16(%eax, %edi), %xmm2
867 sub $32, %ecx
868 movdqa 32(%eax, %edi), %xmm3
869 movdqa %xmm3, %xmm4
870 palignr $5, %xmm2, %xmm3
871 palignr $5, %xmm1, %xmm2
872 lea 32(%edi), %edi
873 movdqa %xmm2, -32(%edx, %edi)
874 movdqa %xmm3, -16(%edx, %edi)
875
876 jb L(sh_5_end_no_prefetch_loop)
877
878 movdqa 16(%eax, %edi), %xmm2
879 sub $32, %ecx
880 movdqa 32(%eax, %edi), %xmm3
881 movdqa %xmm3, %xmm1
882 palignr $5, %xmm2, %xmm3
883 palignr $5, %xmm4, %xmm2
884 lea 32(%edi), %edi
885 movdqa %xmm2, -32(%edx, %edi)
886 movdqa %xmm3, -16(%edx, %edi)
887
888 jae L(sh_5_no_prefetch_loop)
889
890L(sh_5_end_no_prefetch_loop):
891 lea 32(%ecx), %ecx
892 add %ecx, %edi
893 add %edi, %edx
894 lea 5(%edi, %eax), %eax
895 POP (%edi)
896 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
897
898 CFI_PUSH (%edi)
899
900 .p2align 4
901L(shl_6):
902# ifndef USE_AS_MEMMOVE
903 movaps -6(%eax), %xmm1
904# else
905 movl DEST+4(%esp), %edi
906 movaps -6(%eax), %xmm1
907 movdqu %xmm0, (%edi)
908# endif
909# ifdef DATA_CACHE_SIZE_HALF
910 cmp $DATA_CACHE_SIZE_HALF, %ecx
911# else
912# ifdef PIC
913 SETUP_PIC_REG(bx)
914 add $_GLOBAL_OFFSET_TABLE_, %ebx
915 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
916# else
917 cmp __x86_data_cache_size_half, %ecx
918# endif
919# endif
920 jb L(sh_6_no_prefetch)
921
922 lea -64(%ecx), %ecx
923
924 .p2align 4
925L(Shl6LoopStart):
926 prefetcht0 0x1c0(%eax)
927 prefetcht0 0x1c0(%edx)
928 movaps 10(%eax), %xmm2
929 movaps 26(%eax), %xmm3
930 movaps 42(%eax), %xmm4
931 movaps 58(%eax), %xmm5
932 movaps %xmm5, %xmm7
933 palignr $6, %xmm4, %xmm5
934 palignr $6, %xmm3, %xmm4
935 movaps %xmm5, 48(%edx)
936 palignr $6, %xmm2, %xmm3
937 lea 64(%eax), %eax
938 palignr $6, %xmm1, %xmm2
939 movaps %xmm4, 32(%edx)
940 movaps %xmm3, 16(%edx)
941 movaps %xmm7, %xmm1
942 movaps %xmm2, (%edx)
943 lea 64(%edx), %edx
944 sub $64, %ecx
945 ja L(Shl6LoopStart)
946
947L(Shl6LoopLeave):
948 add $32, %ecx
949 jle L(shl_end_0)
950
951 movaps 10(%eax), %xmm2
952 movaps 26(%eax), %xmm3
953 palignr $6, %xmm2, %xmm3
954 palignr $6, %xmm1, %xmm2
955 movaps %xmm2, (%edx)
956 movaps %xmm3, 16(%edx)
957 lea 32(%edx, %ecx), %edx
958 lea 32(%eax, %ecx), %eax
959 POP (%edi)
960 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
961
962 CFI_PUSH (%edi)
963
964 .p2align 4
965L(sh_6_no_prefetch):
966 lea -32(%ecx), %ecx
967 lea -6(%eax), %eax
968 xor %edi, %edi
969
970 .p2align 4
971L(sh_6_no_prefetch_loop):
972 movdqa 16(%eax, %edi), %xmm2
973 sub $32, %ecx
974 movdqa 32(%eax, %edi), %xmm3
975 movdqa %xmm3, %xmm4
976 palignr $6, %xmm2, %xmm3
977 palignr $6, %xmm1, %xmm2
978 lea 32(%edi), %edi
979 movdqa %xmm2, -32(%edx, %edi)
980 movdqa %xmm3, -16(%edx, %edi)
981
982 jb L(sh_6_end_no_prefetch_loop)
983
984 movdqa 16(%eax, %edi), %xmm2
985 sub $32, %ecx
986 movdqa 32(%eax, %edi), %xmm3
987 movdqa %xmm3, %xmm1
988 palignr $6, %xmm2, %xmm3
989 palignr $6, %xmm4, %xmm2
990 lea 32(%edi), %edi
991 movdqa %xmm2, -32(%edx, %edi)
992 movdqa %xmm3, -16(%edx, %edi)
993
994 jae L(sh_6_no_prefetch_loop)
995
996L(sh_6_end_no_prefetch_loop):
997 lea 32(%ecx), %ecx
998 add %ecx, %edi
999 add %edi, %edx
1000 lea 6(%edi, %eax), %eax
1001 POP (%edi)
1002 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1003
1004 CFI_PUSH (%edi)
1005
1006 .p2align 4
1007L(shl_7):
1008# ifndef USE_AS_MEMMOVE
1009 movaps -7(%eax), %xmm1
1010# else
1011 movl DEST+4(%esp), %edi
1012 movaps -7(%eax), %xmm1
1013 movdqu %xmm0, (%edi)
1014# endif
1015# ifdef DATA_CACHE_SIZE_HALF
1016 cmp $DATA_CACHE_SIZE_HALF, %ecx
1017# else
1018# ifdef PIC
1019 SETUP_PIC_REG(bx)
1020 add $_GLOBAL_OFFSET_TABLE_, %ebx
1021 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1022# else
1023 cmp __x86_data_cache_size_half, %ecx
1024# endif
1025# endif
1026 jb L(sh_7_no_prefetch)
1027
1028 lea -64(%ecx), %ecx
1029
1030 .p2align 4
1031L(Shl7LoopStart):
1032 prefetcht0 0x1c0(%eax)
1033 prefetcht0 0x1c0(%edx)
1034 movaps 9(%eax), %xmm2
1035 movaps 25(%eax), %xmm3
1036 movaps 41(%eax), %xmm4
1037 movaps 57(%eax), %xmm5
1038 movaps %xmm5, %xmm7
1039 palignr $7, %xmm4, %xmm5
1040 palignr $7, %xmm3, %xmm4
1041 movaps %xmm5, 48(%edx)
1042 palignr $7, %xmm2, %xmm3
1043 lea 64(%eax), %eax
1044 palignr $7, %xmm1, %xmm2
1045 movaps %xmm4, 32(%edx)
1046 movaps %xmm3, 16(%edx)
1047 movaps %xmm7, %xmm1
1048 movaps %xmm2, (%edx)
1049 lea 64(%edx), %edx
1050 sub $64, %ecx
1051 ja L(Shl7LoopStart)
1052
1053L(Shl7LoopLeave):
1054 add $32, %ecx
1055 jle L(shl_end_0)
1056
1057 movaps 9(%eax), %xmm2
1058 movaps 25(%eax), %xmm3
1059 palignr $7, %xmm2, %xmm3
1060 palignr $7, %xmm1, %xmm2
1061 movaps %xmm2, (%edx)
1062 movaps %xmm3, 16(%edx)
1063 lea 32(%edx, %ecx), %edx
1064 lea 32(%eax, %ecx), %eax
1065 POP (%edi)
1066 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1067
1068 CFI_PUSH (%edi)
1069
1070 .p2align 4
1071L(sh_7_no_prefetch):
1072 lea -32(%ecx), %ecx
1073 lea -7(%eax), %eax
1074 xor %edi, %edi
1075
1076 .p2align 4
1077L(sh_7_no_prefetch_loop):
1078 movdqa 16(%eax, %edi), %xmm2
1079 sub $32, %ecx
1080 movdqa 32(%eax, %edi), %xmm3
1081 movdqa %xmm3, %xmm4
1082 palignr $7, %xmm2, %xmm3
1083 palignr $7, %xmm1, %xmm2
1084 lea 32(%edi), %edi
1085 movdqa %xmm2, -32(%edx, %edi)
1086 movdqa %xmm3, -16(%edx, %edi)
1087 jb L(sh_7_end_no_prefetch_loop)
1088
1089 movdqa 16(%eax, %edi), %xmm2
1090 sub $32, %ecx
1091 movdqa 32(%eax, %edi), %xmm3
1092 movdqa %xmm3, %xmm1
1093 palignr $7, %xmm2, %xmm3
1094 palignr $7, %xmm4, %xmm2
1095 lea 32(%edi), %edi
1096 movdqa %xmm2, -32(%edx, %edi)
1097 movdqa %xmm3, -16(%edx, %edi)
1098 jae L(sh_7_no_prefetch_loop)
1099
1100L(sh_7_end_no_prefetch_loop):
1101 lea 32(%ecx), %ecx
1102 add %ecx, %edi
1103 add %edi, %edx
1104 lea 7(%edi, %eax), %eax
1105 POP (%edi)
1106 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1107
1108 CFI_PUSH (%edi)
1109
1110 .p2align 4
1111L(shl_8):
1112# ifndef USE_AS_MEMMOVE
1113 movaps -8(%eax), %xmm1
1114# else
1115 movl DEST+4(%esp), %edi
1116 movaps -8(%eax), %xmm1
1117 movdqu %xmm0, (%edi)
1118# endif
1119# ifdef DATA_CACHE_SIZE_HALF
1120 cmp $DATA_CACHE_SIZE_HALF, %ecx
1121# else
1122# ifdef PIC
1123 SETUP_PIC_REG(bx)
1124 add $_GLOBAL_OFFSET_TABLE_, %ebx
1125 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1126# else
1127 cmp __x86_data_cache_size_half, %ecx
1128# endif
1129# endif
1130 jb L(sh_8_no_prefetch)
1131
1132 lea -64(%ecx), %ecx
1133
1134 .p2align 4
1135L(Shl8LoopStart):
1136 prefetcht0 0x1c0(%eax)
1137 prefetcht0 0x1c0(%edx)
1138 movaps 8(%eax), %xmm2
1139 movaps 24(%eax), %xmm3
1140 movaps 40(%eax), %xmm4
1141 movaps 56(%eax), %xmm5
1142 movaps %xmm5, %xmm7
1143 palignr $8, %xmm4, %xmm5
1144 palignr $8, %xmm3, %xmm4
1145 movaps %xmm5, 48(%edx)
1146 palignr $8, %xmm2, %xmm3
1147 lea 64(%eax), %eax
1148 palignr $8, %xmm1, %xmm2
1149 movaps %xmm4, 32(%edx)
1150 movaps %xmm3, 16(%edx)
1151 movaps %xmm7, %xmm1
1152 movaps %xmm2, (%edx)
1153 lea 64(%edx), %edx
1154 sub $64, %ecx
1155 ja L(Shl8LoopStart)
1156
L(Shl8LoopLeave):
1158 add $32, %ecx
1159 jle L(shl_end_0)
1160
1161 movaps 8(%eax), %xmm2
1162 movaps 24(%eax), %xmm3
1163 palignr $8, %xmm2, %xmm3
1164 palignr $8, %xmm1, %xmm2
1165 movaps %xmm2, (%edx)
1166 movaps %xmm3, 16(%edx)
1167 lea 32(%edx, %ecx), %edx
1168 lea 32(%eax, %ecx), %eax
1169 POP (%edi)
1170 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1171
1172 CFI_PUSH (%edi)
1173
1174 .p2align 4
1175L(sh_8_no_prefetch):
1176 lea -32(%ecx), %ecx
1177 lea -8(%eax), %eax
1178 xor %edi, %edi
1179
1180 .p2align 4
1181L(sh_8_no_prefetch_loop):
1182 movdqa 16(%eax, %edi), %xmm2
1183 sub $32, %ecx
1184 movdqa 32(%eax, %edi), %xmm3
1185 movdqa %xmm3, %xmm4
1186 palignr $8, %xmm2, %xmm3
1187 palignr $8, %xmm1, %xmm2
1188 lea 32(%edi), %edi
1189 movdqa %xmm2, -32(%edx, %edi)
1190 movdqa %xmm3, -16(%edx, %edi)
1191 jb L(sh_8_end_no_prefetch_loop)
1192
1193 movdqa 16(%eax, %edi), %xmm2
1194 sub $32, %ecx
1195 movdqa 32(%eax, %edi), %xmm3
1196 movdqa %xmm3, %xmm1
1197 palignr $8, %xmm2, %xmm3
1198 palignr $8, %xmm4, %xmm2
1199 lea 32(%edi), %edi
1200 movdqa %xmm2, -32(%edx, %edi)
1201 movdqa %xmm3, -16(%edx, %edi)
1202 jae L(sh_8_no_prefetch_loop)
1203
1204L(sh_8_end_no_prefetch_loop):
1205 lea 32(%ecx), %ecx
1206 add %ecx, %edi
1207 add %edi, %edx
1208 lea 8(%edi, %eax), %eax
1209 POP (%edi)
1210 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1211
1212 CFI_PUSH (%edi)
1213
1214 .p2align 4
1215L(shl_9):
1216# ifndef USE_AS_MEMMOVE
1217 movaps -9(%eax), %xmm1
1218# else
1219 movl DEST+4(%esp), %edi
1220 movaps -9(%eax), %xmm1
1221 movdqu %xmm0, (%edi)
1222# endif
1223# ifdef DATA_CACHE_SIZE_HALF
1224 cmp $DATA_CACHE_SIZE_HALF, %ecx
1225# else
1226# ifdef PIC
1227 SETUP_PIC_REG(bx)
1228 add $_GLOBAL_OFFSET_TABLE_, %ebx
1229 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1230# else
1231 cmp __x86_data_cache_size_half, %ecx
1232# endif
1233# endif
1234 jb L(sh_9_no_prefetch)
1235
1236 lea -64(%ecx), %ecx
1237
1238 .p2align 4
1239L(Shl9LoopStart):
1240 prefetcht0 0x1c0(%eax)
1241 prefetcht0 0x1c0(%edx)
1242 movaps 7(%eax), %xmm2
1243 movaps 23(%eax), %xmm3
1244 movaps 39(%eax), %xmm4
1245 movaps 55(%eax), %xmm5
1246 movaps %xmm5, %xmm7
1247 palignr $9, %xmm4, %xmm5
1248 palignr $9, %xmm3, %xmm4
1249 movaps %xmm5, 48(%edx)
1250 palignr $9, %xmm2, %xmm3
1251 lea 64(%eax), %eax
1252 palignr $9, %xmm1, %xmm2
1253 movaps %xmm4, 32(%edx)
1254 movaps %xmm3, 16(%edx)
1255 movaps %xmm7, %xmm1
1256 movaps %xmm2, (%edx)
1257 lea 64(%edx), %edx
1258 sub $64, %ecx
1259 ja L(Shl9LoopStart)
1260
1261L(Shl9LoopLeave):
1262 add $32, %ecx
1263 jle L(shl_end_0)
1264
1265 movaps 7(%eax), %xmm2
1266 movaps 23(%eax), %xmm3
1267 palignr $9, %xmm2, %xmm3
1268 palignr $9, %xmm1, %xmm2
1269
1270 movaps %xmm2, (%edx)
1271 movaps %xmm3, 16(%edx)
1272 lea 32(%edx, %ecx), %edx
1273 lea 32(%eax, %ecx), %eax
1274 POP (%edi)
1275 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1276
1277 CFI_PUSH (%edi)
1278
1279 .p2align 4
1280L(sh_9_no_prefetch):
1281 lea -32(%ecx), %ecx
1282 lea -9(%eax), %eax
1283 xor %edi, %edi
1284
1285 .p2align 4
1286L(sh_9_no_prefetch_loop):
1287 movdqa 16(%eax, %edi), %xmm2
1288 sub $32, %ecx
1289 movdqa 32(%eax, %edi), %xmm3
1290 movdqa %xmm3, %xmm4
1291 palignr $9, %xmm2, %xmm3
1292 palignr $9, %xmm1, %xmm2
1293 lea 32(%edi), %edi
1294 movdqa %xmm2, -32(%edx, %edi)
1295 movdqa %xmm3, -16(%edx, %edi)
1296 jb L(sh_9_end_no_prefetch_loop)
1297
1298 movdqa 16(%eax, %edi), %xmm2
1299 sub $32, %ecx
1300 movdqa 32(%eax, %edi), %xmm3
1301 movdqa %xmm3, %xmm1
1302 palignr $9, %xmm2, %xmm3
1303 palignr $9, %xmm4, %xmm2
1304 lea 32(%edi), %edi
1305 movdqa %xmm2, -32(%edx, %edi)
1306 movdqa %xmm3, -16(%edx, %edi)
1307 jae L(sh_9_no_prefetch_loop)
1308
1309L(sh_9_end_no_prefetch_loop):
1310 lea 32(%ecx), %ecx
1311 add %ecx, %edi
1312 add %edi, %edx
1313 lea 9(%edi, %eax), %eax
1314 POP (%edi)
1315 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1316
1317 CFI_PUSH (%edi)
1318
1319 .p2align 4
1320L(shl_10):
1321# ifndef USE_AS_MEMMOVE
1322 movaps -10(%eax), %xmm1
1323# else
1324 movl DEST+4(%esp), %edi
1325 movaps -10(%eax), %xmm1
1326 movdqu %xmm0, (%edi)
1327# endif
1328# ifdef DATA_CACHE_SIZE_HALF
1329 cmp $DATA_CACHE_SIZE_HALF, %ecx
1330# else
1331# ifdef PIC
1332 SETUP_PIC_REG(bx)
1333 add $_GLOBAL_OFFSET_TABLE_, %ebx
1334 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1335# else
1336 cmp __x86_data_cache_size_half, %ecx
1337# endif
1338# endif
1339 jb L(sh_10_no_prefetch)
1340
1341 lea -64(%ecx), %ecx
1342
1343 .p2align 4
1344L(Shl10LoopStart):
1345 prefetcht0 0x1c0(%eax)
1346 prefetcht0 0x1c0(%edx)
1347 movaps 6(%eax), %xmm2
1348 movaps 22(%eax), %xmm3
1349 movaps 38(%eax), %xmm4
1350 movaps 54(%eax), %xmm5
1351 movaps %xmm5, %xmm7
1352 palignr $10, %xmm4, %xmm5
1353 palignr $10, %xmm3, %xmm4
1354 movaps %xmm5, 48(%edx)
1355 palignr $10, %xmm2, %xmm3
1356 lea 64(%eax), %eax
1357 palignr $10, %xmm1, %xmm2
1358 movaps %xmm4, 32(%edx)
1359 movaps %xmm3, 16(%edx)
1360 movaps %xmm7, %xmm1
1361 movaps %xmm2, (%edx)
1362 lea 64(%edx), %edx
1363 sub $64, %ecx
1364 ja L(Shl10LoopStart)
1365
1366L(Shl10LoopLeave):
1367 add $32, %ecx
1368 jle L(shl_end_0)
1369
1370 movaps 6(%eax), %xmm2
1371 movaps 22(%eax), %xmm3
1372 palignr $10, %xmm2, %xmm3
1373 palignr $10, %xmm1, %xmm2
1374
1375 movaps %xmm2, (%edx)
1376 movaps %xmm3, 16(%edx)
1377 lea 32(%edx, %ecx), %edx
1378 lea 32(%eax, %ecx), %eax
1379 POP (%edi)
1380 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1381
1382 CFI_PUSH (%edi)
1383
1384 .p2align 4
1385L(sh_10_no_prefetch):
1386 lea -32(%ecx), %ecx
1387 lea -10(%eax), %eax
1388 xor %edi, %edi
1389
1390 .p2align 4
1391L(sh_10_no_prefetch_loop):
1392 movdqa 16(%eax, %edi), %xmm2
1393 sub $32, %ecx
1394 movdqa 32(%eax, %edi), %xmm3
1395 movdqa %xmm3, %xmm4
1396 palignr $10, %xmm2, %xmm3
1397 palignr $10, %xmm1, %xmm2
1398 lea 32(%edi), %edi
1399 movdqa %xmm2, -32(%edx, %edi)
1400 movdqa %xmm3, -16(%edx, %edi)
1401 jb L(sh_10_end_no_prefetch_loop)
1402
1403 movdqa 16(%eax, %edi), %xmm2
1404 sub $32, %ecx
1405 movdqa 32(%eax, %edi), %xmm3
1406 movdqa %xmm3, %xmm1
1407 palignr $10, %xmm2, %xmm3
1408 palignr $10, %xmm4, %xmm2
1409 lea 32(%edi), %edi
1410 movdqa %xmm2, -32(%edx, %edi)
1411 movdqa %xmm3, -16(%edx, %edi)
1412 jae L(sh_10_no_prefetch_loop)
1413
1414L(sh_10_end_no_prefetch_loop):
1415 lea 32(%ecx), %ecx
1416 add %ecx, %edi
1417 add %edi, %edx
1418 lea 10(%edi, %eax), %eax
1419 POP (%edi)
1420 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1421
1422 CFI_PUSH (%edi)
1423
1424 .p2align 4
1425L(shl_11):
1426# ifndef USE_AS_MEMMOVE
1427 movaps -11(%eax), %xmm1
1428# else
1429 movl DEST+4(%esp), %edi
1430 movaps -11(%eax), %xmm1
1431 movdqu %xmm0, (%edi)
1432# endif
1433# ifdef DATA_CACHE_SIZE_HALF
1434 cmp $DATA_CACHE_SIZE_HALF, %ecx
1435# else
1436# ifdef PIC
1437 SETUP_PIC_REG(bx)
1438 add $_GLOBAL_OFFSET_TABLE_, %ebx
1439 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1440# else
1441 cmp __x86_data_cache_size_half, %ecx
1442# endif
1443# endif
1444 jb L(sh_11_no_prefetch)
1445
1446 lea -64(%ecx), %ecx
1447
1448 .p2align 4
1449L(Shl11LoopStart):
1450 prefetcht0 0x1c0(%eax)
1451 prefetcht0 0x1c0(%edx)
1452 movaps 5(%eax), %xmm2
1453 movaps 21(%eax), %xmm3
1454 movaps 37(%eax), %xmm4
1455 movaps 53(%eax), %xmm5
1456 movaps %xmm5, %xmm7
1457 palignr $11, %xmm4, %xmm5
1458 palignr $11, %xmm3, %xmm4
1459 movaps %xmm5, 48(%edx)
1460 palignr $11, %xmm2, %xmm3
1461 lea 64(%eax), %eax
1462 palignr $11, %xmm1, %xmm2
1463 movaps %xmm4, 32(%edx)
1464 movaps %xmm3, 16(%edx)
1465 movaps %xmm7, %xmm1
1466 movaps %xmm2, (%edx)
1467 lea 64(%edx), %edx
1468 sub $64, %ecx
1469 ja L(Shl11LoopStart)
1470
1471L(Shl11LoopLeave):
1472 add $32, %ecx
1473 jle L(shl_end_0)
1474
1475 movaps 5(%eax), %xmm2
1476 movaps 21(%eax), %xmm3
1477 palignr $11, %xmm2, %xmm3
1478 palignr $11, %xmm1, %xmm2
1479
1480 movaps %xmm2, (%edx)
1481 movaps %xmm3, 16(%edx)
1482 lea 32(%edx, %ecx), %edx
1483 lea 32(%eax, %ecx), %eax
1484 POP (%edi)
1485 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1486
1487 CFI_PUSH (%edi)
1488
1489 .p2align 4
1490L(sh_11_no_prefetch):
1491 lea -32(%ecx), %ecx
1492 lea -11(%eax), %eax
1493 xor %edi, %edi
1494
1495 .p2align 4
1496L(sh_11_no_prefetch_loop):
1497 movdqa 16(%eax, %edi), %xmm2
1498 sub $32, %ecx
1499 movdqa 32(%eax, %edi), %xmm3
1500 movdqa %xmm3, %xmm4
1501 palignr $11, %xmm2, %xmm3
1502 palignr $11, %xmm1, %xmm2
1503 lea 32(%edi), %edi
1504 movdqa %xmm2, -32(%edx, %edi)
1505 movdqa %xmm3, -16(%edx, %edi)
1506 jb L(sh_11_end_no_prefetch_loop)
1507
1508 movdqa 16(%eax, %edi), %xmm2
1509 sub $32, %ecx
1510 movdqa 32(%eax, %edi), %xmm3
1511 movdqa %xmm3, %xmm1
1512 palignr $11, %xmm2, %xmm3
1513 palignr $11, %xmm4, %xmm2
1514 lea 32(%edi), %edi
1515 movdqa %xmm2, -32(%edx, %edi)
1516 movdqa %xmm3, -16(%edx, %edi)
1517 jae L(sh_11_no_prefetch_loop)
1518
1519L(sh_11_end_no_prefetch_loop):
1520 lea 32(%ecx), %ecx
1521 add %ecx, %edi
1522 add %edi, %edx
1523 lea 11(%edi, %eax), %eax
1524 POP (%edi)
1525 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1526
1527 CFI_PUSH (%edi)
1528
1529 .p2align 4
1530L(shl_12):
1531# ifndef USE_AS_MEMMOVE
1532 movaps -12(%eax), %xmm1
1533# else
1534 movl DEST+4(%esp), %edi
1535 movaps -12(%eax), %xmm1
1536 movdqu %xmm0, (%edi)
1537# endif
1538# ifdef DATA_CACHE_SIZE_HALF
1539 cmp $DATA_CACHE_SIZE_HALF, %ecx
1540# else
1541# ifdef PIC
1542 SETUP_PIC_REG(bx)
1543 add $_GLOBAL_OFFSET_TABLE_, %ebx
1544 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1545# else
1546 cmp __x86_data_cache_size_half, %ecx
1547# endif
1548# endif
1549 jb L(sh_12_no_prefetch)
1550
1551 lea -64(%ecx), %ecx
1552
1553 .p2align 4
1554L(Shl12LoopStart):
1555 prefetcht0 0x1c0(%eax)
1556 prefetcht0 0x1c0(%edx)
1557 movaps 4(%eax), %xmm2
1558 movaps 20(%eax), %xmm3
1559 movaps 36(%eax), %xmm4
1560 movaps 52(%eax), %xmm5
1561 movaps %xmm5, %xmm7
1562 palignr $12, %xmm4, %xmm5
1563 palignr $12, %xmm3, %xmm4
1564 movaps %xmm5, 48(%edx)
1565 palignr $12, %xmm2, %xmm3
1566 lea 64(%eax), %eax
1567 palignr $12, %xmm1, %xmm2
1568 movaps %xmm4, 32(%edx)
1569 movaps %xmm3, 16(%edx)
1570 movaps %xmm7, %xmm1
1571 movaps %xmm2, (%edx)
1572 lea 64(%edx), %edx
1573 sub $64, %ecx
1574 ja L(Shl12LoopStart)
1575
1576L(Shl12LoopLeave):
1577 add $32, %ecx
1578 jle L(shl_end_0)
1579
1580 movaps 4(%eax), %xmm2
1581 movaps 20(%eax), %xmm3
1582 palignr $12, %xmm2, %xmm3
1583 palignr $12, %xmm1, %xmm2
1584
1585 movaps %xmm2, (%edx)
1586 movaps %xmm3, 16(%edx)
1587 lea 32(%edx, %ecx), %edx
1588 lea 32(%eax, %ecx), %eax
1589 POP (%edi)
1590 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1591
1592 CFI_PUSH (%edi)
1593
1594 .p2align 4
1595L(sh_12_no_prefetch):
1596 lea -32(%ecx), %ecx
1597 lea -12(%eax), %eax
1598 xor %edi, %edi
1599
1600 .p2align 4
1601L(sh_12_no_prefetch_loop):
1602 movdqa 16(%eax, %edi), %xmm2
1603 sub $32, %ecx
1604 movdqa 32(%eax, %edi), %xmm3
1605 movdqa %xmm3, %xmm4
1606 palignr $12, %xmm2, %xmm3
1607 palignr $12, %xmm1, %xmm2
1608 lea 32(%edi), %edi
1609 movdqa %xmm2, -32(%edx, %edi)
1610 movdqa %xmm3, -16(%edx, %edi)
1611 jb L(sh_12_end_no_prefetch_loop)
1612
1613 movdqa 16(%eax, %edi), %xmm2
1614 sub $32, %ecx
1615 movdqa 32(%eax, %edi), %xmm3
1616 movdqa %xmm3, %xmm1
1617 palignr $12, %xmm2, %xmm3
1618 palignr $12, %xmm4, %xmm2
1619 lea 32(%edi), %edi
1620 movdqa %xmm2, -32(%edx, %edi)
1621 movdqa %xmm3, -16(%edx, %edi)
1622 jae L(sh_12_no_prefetch_loop)
1623
1624L(sh_12_end_no_prefetch_loop):
1625 lea 32(%ecx), %ecx
1626 add %ecx, %edi
1627 add %edi, %edx
1628 lea 12(%edi, %eax), %eax
1629 POP (%edi)
1630 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1631
1632 CFI_PUSH (%edi)
1633
1634 .p2align 4
1635L(shl_13):
1636# ifndef USE_AS_MEMMOVE
1637 movaps -13(%eax), %xmm1
1638# else
1639 movl DEST+4(%esp), %edi
1640 movaps -13(%eax), %xmm1
1641 movdqu %xmm0, (%edi)
1642# endif
1643# ifdef DATA_CACHE_SIZE_HALF
1644 cmp $DATA_CACHE_SIZE_HALF, %ecx
1645# else
1646# ifdef PIC
1647 SETUP_PIC_REG(bx)
1648 add $_GLOBAL_OFFSET_TABLE_, %ebx
1649 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1650# else
1651 cmp __x86_data_cache_size_half, %ecx
1652# endif
1653# endif
1654 jb L(sh_13_no_prefetch)
1655
1656 lea -64(%ecx), %ecx
1657
1658 .p2align 4
1659L(Shl13LoopStart):
1660 prefetcht0 0x1c0(%eax)
1661 prefetcht0 0x1c0(%edx)
1662 movaps 3(%eax), %xmm2
1663 movaps 19(%eax), %xmm3
1664 movaps 35(%eax), %xmm4
1665 movaps 51(%eax), %xmm5
1666 movaps %xmm5, %xmm7
1667 palignr $13, %xmm4, %xmm5
1668 palignr $13, %xmm3, %xmm4
1669 movaps %xmm5, 48(%edx)
1670 palignr $13, %xmm2, %xmm3
1671 lea 64(%eax), %eax
1672 palignr $13, %xmm1, %xmm2
1673 movaps %xmm4, 32(%edx)
1674 movaps %xmm3, 16(%edx)
1675 movaps %xmm7, %xmm1
1676 movaps %xmm2, (%edx)
1677 lea 64(%edx), %edx
1678 sub $64, %ecx
1679 ja L(Shl13LoopStart)
1680
1681L(Shl13LoopLeave):
1682 add $32, %ecx
1683 jle L(shl_end_0)
1684
1685 movaps 3(%eax), %xmm2
1686 movaps 19(%eax), %xmm3
1687 palignr $13, %xmm2, %xmm3
1688 palignr $13, %xmm1, %xmm2
1689
1690 movaps %xmm2, (%edx)
1691 movaps %xmm3, 16(%edx)
1692 lea 32(%edx, %ecx), %edx
1693 lea 32(%eax, %ecx), %eax
1694 POP (%edi)
1695 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1696
1697 CFI_PUSH (%edi)
1698
1699 .p2align 4
1700L(sh_13_no_prefetch):
1701 lea -32(%ecx), %ecx
1702 lea -13(%eax), %eax
1703 xor %edi, %edi
1704
1705 .p2align 4
1706L(sh_13_no_prefetch_loop):
1707 movdqa 16(%eax, %edi), %xmm2
1708 sub $32, %ecx
1709 movdqa 32(%eax, %edi), %xmm3
1710 movdqa %xmm3, %xmm4
1711 palignr $13, %xmm2, %xmm3
1712 palignr $13, %xmm1, %xmm2
1713 lea 32(%edi), %edi
1714 movdqa %xmm2, -32(%edx, %edi)
1715 movdqa %xmm3, -16(%edx, %edi)
1716 jb L(sh_13_end_no_prefetch_loop)
1717
1718 movdqa 16(%eax, %edi), %xmm2
1719 sub $32, %ecx
1720 movdqa 32(%eax, %edi), %xmm3
1721 movdqa %xmm3, %xmm1
1722 palignr $13, %xmm2, %xmm3
1723 palignr $13, %xmm4, %xmm2
1724 lea 32(%edi), %edi
1725 movdqa %xmm2, -32(%edx, %edi)
1726 movdqa %xmm3, -16(%edx, %edi)
1727 jae L(sh_13_no_prefetch_loop)
1728
1729L(sh_13_end_no_prefetch_loop):
1730 lea 32(%ecx), %ecx
1731 add %ecx, %edi
1732 add %edi, %edx
1733 lea 13(%edi, %eax), %eax
1734 POP (%edi)
1735 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1736
1737 CFI_PUSH (%edi)
1738
1739 .p2align 4
1740L(shl_14):
1741# ifndef USE_AS_MEMMOVE
1742 movaps -14(%eax), %xmm1
1743# else
1744 movl DEST+4(%esp), %edi
1745 movaps -14(%eax), %xmm1
1746 movdqu %xmm0, (%edi)
1747# endif
1748# ifdef DATA_CACHE_SIZE_HALF
1749 cmp $DATA_CACHE_SIZE_HALF, %ecx
1750# else
1751# ifdef PIC
1752 SETUP_PIC_REG(bx)
1753 add $_GLOBAL_OFFSET_TABLE_, %ebx
1754 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1755# else
1756 cmp __x86_data_cache_size_half, %ecx
1757# endif
1758# endif
1759 jb L(sh_14_no_prefetch)
1760
1761 lea -64(%ecx), %ecx
1762
1763 .p2align 4
1764L(Shl14LoopStart):
1765 prefetcht0 0x1c0(%eax)
1766 prefetcht0 0x1c0(%edx)
1767 movaps 2(%eax), %xmm2
1768 movaps 18(%eax), %xmm3
1769 movaps 34(%eax), %xmm4
1770 movaps 50(%eax), %xmm5
1771 movaps %xmm5, %xmm7
1772 palignr $14, %xmm4, %xmm5
1773 palignr $14, %xmm3, %xmm4
1774 movaps %xmm5, 48(%edx)
1775 palignr $14, %xmm2, %xmm3
1776 lea 64(%eax), %eax
1777 palignr $14, %xmm1, %xmm2
1778 movaps %xmm4, 32(%edx)
1779 movaps %xmm3, 16(%edx)
1780 movaps %xmm7, %xmm1
1781 movaps %xmm2, (%edx)
1782 lea 64(%edx), %edx
1783 sub $64, %ecx
1784 ja L(Shl14LoopStart)
1785
1786L(Shl14LoopLeave):
1787 add $32, %ecx
1788 jle L(shl_end_0)
1789
1790 movaps 2(%eax), %xmm2
1791 movaps 18(%eax), %xmm3
1792 palignr $14, %xmm2, %xmm3
1793 palignr $14, %xmm1, %xmm2
1794
1795 movaps %xmm2, (%edx)
1796 movaps %xmm3, 16(%edx)
1797 lea 32(%edx, %ecx), %edx
1798 lea 32(%eax, %ecx), %eax
1799 POP (%edi)
1800 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1801
1802 CFI_PUSH (%edi)
1803
1804 .p2align 4
1805L(sh_14_no_prefetch):
1806 lea -32(%ecx), %ecx
1807 lea -14(%eax), %eax
1808 xor %edi, %edi
1809
1810 .p2align 4
1811L(sh_14_no_prefetch_loop):
1812 movdqa 16(%eax, %edi), %xmm2
1813 sub $32, %ecx
1814 movdqa 32(%eax, %edi), %xmm3
1815 movdqa %xmm3, %xmm4
1816 palignr $14, %xmm2, %xmm3
1817 palignr $14, %xmm1, %xmm2
1818 lea 32(%edi), %edi
1819 movdqa %xmm2, -32(%edx, %edi)
1820 movdqa %xmm3, -16(%edx, %edi)
1821 jb L(sh_14_end_no_prefetch_loop)
1822
1823 movdqa 16(%eax, %edi), %xmm2
1824 sub $32, %ecx
1825 movdqa 32(%eax, %edi), %xmm3
1826 movdqa %xmm3, %xmm1
1827 palignr $14, %xmm2, %xmm3
1828 palignr $14, %xmm4, %xmm2
1829 lea 32(%edi), %edi
1830 movdqa %xmm2, -32(%edx, %edi)
1831 movdqa %xmm3, -16(%edx, %edi)
1832 jae L(sh_14_no_prefetch_loop)
1833
1834L(sh_14_end_no_prefetch_loop):
1835 lea 32(%ecx), %ecx
1836 add %ecx, %edi
1837 add %edi, %edx
1838 lea 14(%edi, %eax), %eax
1839 POP (%edi)
1840 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1841
1842 CFI_PUSH (%edi)
1843
1844 .p2align 4
1845L(shl_15):
1846# ifndef USE_AS_MEMMOVE
1847 movaps -15(%eax), %xmm1
1848# else
1849 movl DEST+4(%esp), %edi
1850 movaps -15(%eax), %xmm1
1851 movdqu %xmm0, (%edi)
1852# endif
1853# ifdef DATA_CACHE_SIZE_HALF
1854 cmp $DATA_CACHE_SIZE_HALF, %ecx
1855# else
1856# ifdef PIC
1857 SETUP_PIC_REG(bx)
1858 add $_GLOBAL_OFFSET_TABLE_, %ebx
1859 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1860# else
1861 cmp __x86_data_cache_size_half, %ecx
1862# endif
1863# endif
1864 jb L(sh_15_no_prefetch)
1865
1866 lea -64(%ecx), %ecx
1867
1868 .p2align 4
1869L(Shl15LoopStart):
1870 prefetcht0 0x1c0(%eax)
1871 prefetcht0 0x1c0(%edx)
1872 movaps 1(%eax), %xmm2
1873 movaps 17(%eax), %xmm3
1874 movaps 33(%eax), %xmm4
1875 movaps 49(%eax), %xmm5
1876 movaps %xmm5, %xmm7
1877 palignr $15, %xmm4, %xmm5
1878 palignr $15, %xmm3, %xmm4
1879 movaps %xmm5, 48(%edx)
1880 palignr $15, %xmm2, %xmm3
1881 lea 64(%eax), %eax
1882 palignr $15, %xmm1, %xmm2
1883 movaps %xmm4, 32(%edx)
1884 movaps %xmm3, 16(%edx)
1885 movaps %xmm7, %xmm1
1886 movaps %xmm2, (%edx)
1887 lea 64(%edx), %edx
1888 sub $64, %ecx
1889 ja L(Shl15LoopStart)
1890
1891L(Shl15LoopLeave):
1892 add $32, %ecx
1893 jle L(shl_end_0)
1894
1895 movaps 1(%eax), %xmm2
1896 movaps 17(%eax), %xmm3
1897 palignr $15, %xmm2, %xmm3
1898 palignr $15, %xmm1, %xmm2
1899
1900 movaps %xmm2, (%edx)
1901 movaps %xmm3, 16(%edx)
1902 lea 32(%edx, %ecx), %edx
1903 lea 32(%eax, %ecx), %eax
1904 POP (%edi)
1905 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1906
1907 CFI_PUSH (%edi)
1908
1909 .p2align 4
1910L(sh_15_no_prefetch):
1911 lea -32(%ecx), %ecx
1912 lea -15(%eax), %eax
1913 xor %edi, %edi
1914
1915 .p2align 4
1916L(sh_15_no_prefetch_loop):
1917 movdqa 16(%eax, %edi), %xmm2
1918 sub $32, %ecx
1919 movdqa 32(%eax, %edi), %xmm3
1920 movdqa %xmm3, %xmm4
1921 palignr $15, %xmm2, %xmm3
1922 palignr $15, %xmm1, %xmm2
1923 lea 32(%edi), %edi
1924 movdqa %xmm2, -32(%edx, %edi)
1925 movdqa %xmm3, -16(%edx, %edi)
1926 jb L(sh_15_end_no_prefetch_loop)
1927
1928 movdqa 16(%eax, %edi), %xmm2
1929 sub $32, %ecx
1930 movdqa 32(%eax, %edi), %xmm3
1931 movdqa %xmm3, %xmm1
1932 palignr $15, %xmm2, %xmm3
1933 palignr $15, %xmm4, %xmm2
1934 lea 32(%edi), %edi
1935 movdqa %xmm2, -32(%edx, %edi)
1936 movdqa %xmm3, -16(%edx, %edi)
1937 jae L(sh_15_no_prefetch_loop)
1938
1939L(sh_15_end_no_prefetch_loop):
1940 lea 32(%ecx), %ecx
1941 add %ecx, %edi
1942 add %edi, %edx
1943 lea 15(%edi, %eax), %eax
1944 POP (%edi)
1945 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1946
1947 CFI_PUSH (%edi)
1948
1949 .p2align 4
1950L(shl_end_0):
1951 lea 32(%ecx), %ecx
1952 lea (%edx, %ecx), %edx
1953 lea (%eax, %ecx), %eax
1954 POP (%edi)
1955 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1956
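/* Forward tail copies of 0..47 remaining bytes, reached through the
   L(table_48bytes_fwd) jump table; each entry falls through to the next
   smaller size with the same residue mod 8.  */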
1957 .p2align 4
1958L(fwd_write_44bytes):
1959 movq -44(%eax), %xmm0
1960 movq %xmm0, -44(%edx)
1961L(fwd_write_36bytes):
1962 movq -36(%eax), %xmm0
1963 movq %xmm0, -36(%edx)
1964L(fwd_write_28bytes):
1965 movq -28(%eax), %xmm0
1966 movq %xmm0, -28(%edx)
1967L(fwd_write_20bytes):
1968 movq -20(%eax), %xmm0
1969 movq %xmm0, -20(%edx)
1970L(fwd_write_12bytes):
1971 movq -12(%eax), %xmm0
1972 movq %xmm0, -12(%edx)
1973L(fwd_write_4bytes):
1974 movl -4(%eax), %ecx
1975 movl %ecx, -4(%edx)
1976# ifdef USE_AS_MEMPCPY
1977 movl %edx, %eax
1978# else
1979 movl DEST(%esp), %eax
1980# endif
1981 RETURN
1982
1983 .p2align 4
1984L(fwd_write_40bytes):
1985 movq -40(%eax), %xmm0
1986 movq %xmm0, -40(%edx)
1987L(fwd_write_32bytes):
1988 movq -32(%eax), %xmm0
1989 movq %xmm0, -32(%edx)
1990L(fwd_write_24bytes):
1991 movq -24(%eax), %xmm0
1992 movq %xmm0, -24(%edx)
1993L(fwd_write_16bytes):
1994 movq -16(%eax), %xmm0
1995 movq %xmm0, -16(%edx)
1996L(fwd_write_8bytes):
1997 movq -8(%eax), %xmm0
1998 movq %xmm0, -8(%edx)
1999L(fwd_write_0bytes):
2000# ifdef USE_AS_MEMPCPY
2001 movl %edx, %eax
2002# else
2003 movl DEST(%esp), %eax
2004# endif
2005 RETURN
2006
2007 .p2align 4
2008L(fwd_write_5bytes):
2009 movl -5(%eax), %ecx
2010 movl -4(%eax), %eax
2011 movl %ecx, -5(%edx)
2012 movl %eax, -4(%edx)
2013# ifdef USE_AS_MEMPCPY
2014 movl %edx, %eax
2015# else
2016 movl DEST(%esp), %eax
2017# endif
2018 RETURN
2019
2020 .p2align 4
2021L(fwd_write_45bytes):
2022 movq -45(%eax), %xmm0
2023 movq %xmm0, -45(%edx)
2024L(fwd_write_37bytes):
2025 movq -37(%eax), %xmm0
2026 movq %xmm0, -37(%edx)
2027L(fwd_write_29bytes):
2028 movq -29(%eax), %xmm0
2029 movq %xmm0, -29(%edx)
2030L(fwd_write_21bytes):
2031 movq -21(%eax), %xmm0
2032 movq %xmm0, -21(%edx)
2033L(fwd_write_13bytes):
2034 movq -13(%eax), %xmm0
2035 movq %xmm0, -13(%edx)
2036 movl -5(%eax), %ecx
2037 movl %ecx, -5(%edx)
2038 movzbl -1(%eax), %ecx
2039 movb %cl, -1(%edx)
2040# ifdef USE_AS_MEMPCPY
2041 movl %edx, %eax
2042# else
2043 movl DEST(%esp), %eax
2044# endif
2045 RETURN
2046
2047 .p2align 4
2048L(fwd_write_41bytes):
2049 movq -41(%eax), %xmm0
2050 movq %xmm0, -41(%edx)
2051L(fwd_write_33bytes):
2052 movq -33(%eax), %xmm0
2053 movq %xmm0, -33(%edx)
2054L(fwd_write_25bytes):
2055 movq -25(%eax), %xmm0
2056 movq %xmm0, -25(%edx)
2057L(fwd_write_17bytes):
2058 movq -17(%eax), %xmm0
2059 movq %xmm0, -17(%edx)
2060L(fwd_write_9bytes):
2061 movq -9(%eax), %xmm0
2062 movq %xmm0, -9(%edx)
2063L(fwd_write_1bytes):
2064 movzbl -1(%eax), %ecx
2065 movb %cl, -1(%edx)
2066# ifdef USE_AS_MEMPCPY
2067 movl %edx, %eax
2068# else
2069 movl DEST(%esp), %eax
2070# endif
2071 RETURN
2072
2073 .p2align 4
2074L(fwd_write_46bytes):
2075 movq -46(%eax), %xmm0
2076 movq %xmm0, -46(%edx)
2077L(fwd_write_38bytes):
2078 movq -38(%eax), %xmm0
2079 movq %xmm0, -38(%edx)
2080L(fwd_write_30bytes):
2081 movq -30(%eax), %xmm0
2082 movq %xmm0, -30(%edx)
2083L(fwd_write_22bytes):
2084 movq -22(%eax), %xmm0
2085 movq %xmm0, -22(%edx)
2086L(fwd_write_14bytes):
2087 movq -14(%eax), %xmm0
2088 movq %xmm0, -14(%edx)
2089L(fwd_write_6bytes):
2090 movl -6(%eax), %ecx
2091 movl %ecx, -6(%edx)
2092 movzwl -2(%eax), %ecx
2093 movw %cx, -2(%edx)
2094# ifdef USE_AS_MEMPCPY
2095 movl %edx, %eax
2096# else
2097 movl DEST(%esp), %eax
2098# endif
2099 RETURN
2100
2101 .p2align 4
2102L(fwd_write_42bytes):
2103 movq -42(%eax), %xmm0
2104 movq %xmm0, -42(%edx)
2105L(fwd_write_34bytes):
2106 movq -34(%eax), %xmm0
2107 movq %xmm0, -34(%edx)
2108L(fwd_write_26bytes):
2109 movq -26(%eax), %xmm0
2110 movq %xmm0, -26(%edx)
2111L(fwd_write_18bytes):
2112 movq -18(%eax), %xmm0
2113 movq %xmm0, -18(%edx)
2114L(fwd_write_10bytes):
2115 movq -10(%eax), %xmm0
2116 movq %xmm0, -10(%edx)
2117L(fwd_write_2bytes):
2118 movzwl -2(%eax), %ecx
2119 movw %cx, -2(%edx)
2120# ifdef USE_AS_MEMPCPY
2121 movl %edx, %eax
2122# else
2123 movl DEST(%esp), %eax
2124# endif
2125 RETURN
2126
2127 .p2align 4
2128L(fwd_write_47bytes):
2129 movq -47(%eax), %xmm0
2130 movq %xmm0, -47(%edx)
2131L(fwd_write_39bytes):
2132 movq -39(%eax), %xmm0
2133 movq %xmm0, -39(%edx)
2134L(fwd_write_31bytes):
2135 movq -31(%eax), %xmm0
2136 movq %xmm0, -31(%edx)
2137L(fwd_write_23bytes):
2138 movq -23(%eax), %xmm0
2139 movq %xmm0, -23(%edx)
2140L(fwd_write_15bytes):
2141 movq -15(%eax), %xmm0
2142 movq %xmm0, -15(%edx)
2143L(fwd_write_7bytes):
2144 movl -7(%eax), %ecx
2145 movl %ecx, -7(%edx)
2146 movzwl -3(%eax), %ecx
2147 movzbl -1(%eax), %eax
2148 movw %cx, -3(%edx)
2149 movb %al, -1(%edx)
2150# ifdef USE_AS_MEMPCPY
2151 movl %edx, %eax
2152# else
2153 movl DEST(%esp), %eax
2154# endif
2155 RETURN
2156
2157 .p2align 4
2158L(fwd_write_43bytes):
2159 movq -43(%eax), %xmm0
2160 movq %xmm0, -43(%edx)
2161L(fwd_write_35bytes):
2162 movq -35(%eax), %xmm0
2163 movq %xmm0, -35(%edx)
2164L(fwd_write_27bytes):
2165 movq -27(%eax), %xmm0
2166 movq %xmm0, -27(%edx)
2167L(fwd_write_19bytes):
2168 movq -19(%eax), %xmm0
2169 movq %xmm0, -19(%edx)
2170L(fwd_write_11bytes):
2171 movq -11(%eax), %xmm0
2172 movq %xmm0, -11(%edx)
2173L(fwd_write_3bytes):
2174 movzwl -3(%eax), %ecx
2175 movzbl -1(%eax), %eax
2176 movw %cx, -3(%edx)
2177 movb %al, -1(%edx)
2178# ifdef USE_AS_MEMPCPY
2179 movl %edx, %eax
2180# else
2181 movl DEST(%esp), %eax
2182# endif
2183 RETURN
2184
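/* Tail copies for the aligned paths (reached via L(table_48bytes_fwd_align)):
   the start of the remaining bytes is 16-byte aligned, so the 16-byte pieces
   can use movdqa.  */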
2185 .p2align 4
2186L(fwd_write_40bytes_align):
2187 movdqa -40(%eax), %xmm0
2188 movdqa %xmm0, -40(%edx)
2189L(fwd_write_24bytes_align):
2190 movdqa -24(%eax), %xmm0
2191 movdqa %xmm0, -24(%edx)
2192L(fwd_write_8bytes_align):
2193 movq -8(%eax), %xmm0
2194 movq %xmm0, -8(%edx)
2195L(fwd_write_0bytes_align):
2196# ifdef USE_AS_MEMPCPY
2197 movl %edx, %eax
2198# else
2199 movl DEST(%esp), %eax
2200# endif
2201 RETURN
2202
2203 .p2align 4
2204L(fwd_write_32bytes_align):
2205 movdqa -32(%eax), %xmm0
2206 movdqa %xmm0, -32(%edx)
2207L(fwd_write_16bytes_align):
2208 movdqa -16(%eax), %xmm0
2209 movdqa %xmm0, -16(%edx)
2210# ifdef USE_AS_MEMPCPY
2211 movl %edx, %eax
2212# else
2213 movl DEST(%esp), %eax
2214# endif
2215 RETURN
2216
2217 .p2align 4
2218L(fwd_write_5bytes_align):
2219 movl -5(%eax), %ecx
2220 movl -4(%eax), %eax
2221 movl %ecx, -5(%edx)
2222 movl %eax, -4(%edx)
2223# ifdef USE_AS_MEMPCPY
2224 movl %edx, %eax
2225# else
2226 movl DEST(%esp), %eax
2227# endif
2228 RETURN
2229
2230 .p2align 4
2231L(fwd_write_45bytes_align):
2232 movdqa -45(%eax), %xmm0
2233 movdqa %xmm0, -45(%edx)
2234L(fwd_write_29bytes_align):
2235 movdqa -29(%eax), %xmm0
2236 movdqa %xmm0, -29(%edx)
2237L(fwd_write_13bytes_align):
2238 movq -13(%eax), %xmm0
2239 movq %xmm0, -13(%edx)
2240 movl -5(%eax), %ecx
2241 movl %ecx, -5(%edx)
2242 movzbl -1(%eax), %ecx
2243 movb %cl, -1(%edx)
2244# ifdef USE_AS_MEMPCPY
2245 movl %edx, %eax
2246# else
2247 movl DEST(%esp), %eax
2248# endif
2249 RETURN
2250
2251 .p2align 4
2252L(fwd_write_37bytes_align):
2253 movdqa -37(%eax), %xmm0
2254 movdqa %xmm0, -37(%edx)
2255L(fwd_write_21bytes_align):
2256 movdqa -21(%eax), %xmm0
2257 movdqa %xmm0, -21(%edx)
2258 movl -5(%eax), %ecx
2259 movl %ecx, -5(%edx)
2260 movzbl -1(%eax), %ecx
2261 movb %cl, -1(%edx)
2262# ifdef USE_AS_MEMPCPY
2263 movl %edx, %eax
2264# else
2265 movl DEST(%esp), %eax
2266# endif
2267 RETURN
2268
2269 .p2align 4
2270L(fwd_write_41bytes_align):
2271 movdqa -41(%eax), %xmm0
2272 movdqa %xmm0, -41(%edx)
2273L(fwd_write_25bytes_align):
2274 movdqa -25(%eax), %xmm0
2275 movdqa %xmm0, -25(%edx)
2276L(fwd_write_9bytes_align):
2277 movq -9(%eax), %xmm0
2278 movq %xmm0, -9(%edx)
2279L(fwd_write_1bytes_align):
2280 movzbl -1(%eax), %ecx
2281 movb %cl, -1(%edx)
2282# ifdef USE_AS_MEMPCPY
2283 movl %edx, %eax
2284# else
2285 movl DEST(%esp), %eax
2286# endif
2287 RETURN
2288
2289 .p2align 4
2290L(fwd_write_33bytes_align):
2291 movdqa -33(%eax), %xmm0
2292 movdqa %xmm0, -33(%edx)
2293L(fwd_write_17bytes_align):
2294 movdqa -17(%eax), %xmm0
2295 movdqa %xmm0, -17(%edx)
2296 movzbl -1(%eax), %ecx
2297 movb %cl, -1(%edx)
2298# ifdef USE_AS_MEMPCPY
2299 movl %edx, %eax
2300# else
2301 movl DEST(%esp), %eax
2302# endif
2303 RETURN
2304
2305 .p2align 4
2306L(fwd_write_46bytes_align):
2307 movdqa -46(%eax), %xmm0
2308 movdqa %xmm0, -46(%edx)
2309L(fwd_write_30bytes_align):
2310 movdqa -30(%eax), %xmm0
2311 movdqa %xmm0, -30(%edx)
2312L(fwd_write_14bytes_align):
2313 movq -14(%eax), %xmm0
2314 movq %xmm0, -14(%edx)
2315L(fwd_write_6bytes_align):
2316 movl -6(%eax), %ecx
2317 movl %ecx, -6(%edx)
2318 movzwl -2(%eax), %ecx
2319 movw %cx, -2(%edx)
2320# ifdef USE_AS_MEMPCPY
2321 movl %edx, %eax
2322# else
2323 movl DEST(%esp), %eax
2324# endif
2325 RETURN
2326
2327 .p2align 4
2328L(fwd_write_38bytes_align):
2329 movdqa -38(%eax), %xmm0
2330 movdqa %xmm0, -38(%edx)
2331L(fwd_write_22bytes_align):
2332 movdqa -22(%eax), %xmm0
2333 movdqa %xmm0, -22(%edx)
2334 movl -6(%eax), %ecx
2335 movl %ecx, -6(%edx)
2336 movzwl -2(%eax), %ecx
2337 movw %cx, -2(%edx)
2338# ifdef USE_AS_MEMPCPY
2339 movl %edx, %eax
2340# else
2341 movl DEST(%esp), %eax
2342# endif
2343 RETURN
2344
2345 .p2align 4
2346L(fwd_write_42bytes_align):
2347 movdqa -42(%eax), %xmm0
2348 movdqa %xmm0, -42(%edx)
2349L(fwd_write_26bytes_align):
2350 movdqa -26(%eax), %xmm0
2351 movdqa %xmm0, -26(%edx)
2352L(fwd_write_10bytes_align):
2353 movq -10(%eax), %xmm0
2354 movq %xmm0, -10(%edx)
2355L(fwd_write_2bytes_align):
2356 movzwl -2(%eax), %ecx
2357 movw %cx, -2(%edx)
2358# ifdef USE_AS_MEMPCPY
2359 movl %edx, %eax
2360# else
2361 movl DEST(%esp), %eax
2362# endif
2363 RETURN
2364
2365 .p2align 4
2366L(fwd_write_34bytes_align):
2367 movdqa -34(%eax), %xmm0
2368 movdqa %xmm0, -34(%edx)
2369L(fwd_write_18bytes_align):
2370 movdqa -18(%eax), %xmm0
2371 movdqa %xmm0, -18(%edx)
2372 movzwl -2(%eax), %ecx
2373 movw %cx, -2(%edx)
2374# ifdef USE_AS_MEMPCPY
2375 movl %edx, %eax
2376# else
2377 movl DEST(%esp), %eax
2378# endif
2379 RETURN
2380
2381 .p2align 4
2382L(fwd_write_47bytes_align):
2383 movdqa -47(%eax), %xmm0
2384 movdqa %xmm0, -47(%edx)
2385L(fwd_write_31bytes_align):
2386 movdqa -31(%eax), %xmm0
2387 movdqa %xmm0, -31(%edx)
2388L(fwd_write_15bytes_align):
2389 movq -15(%eax), %xmm0
2390 movq %xmm0, -15(%edx)
2391L(fwd_write_7bytes_align):
2392 movl -7(%eax), %ecx
2393 movl %ecx, -7(%edx)
2394 movzwl -3(%eax), %ecx
2395 movzbl -1(%eax), %eax
2396 movw %cx, -3(%edx)
2397 movb %al, -1(%edx)
2398# ifdef USE_AS_MEMPCPY
2399 movl %edx, %eax
2400# else
2401 movl DEST(%esp), %eax
2402# endif
2403 RETURN
2404
2405 .p2align 4
2406L(fwd_write_39bytes_align):
2407 movdqa -39(%eax), %xmm0
2408 movdqa %xmm0, -39(%edx)
2409L(fwd_write_23bytes_align):
2410 movdqa -23(%eax), %xmm0
2411 movdqa %xmm0, -23(%edx)
2412 movl -7(%eax), %ecx
2413 movl %ecx, -7(%edx)
2414 movzwl -3(%eax), %ecx
2415 movzbl -1(%eax), %eax
2416 movw %cx, -3(%edx)
2417 movb %al, -1(%edx)
2418# ifdef USE_AS_MEMPCPY
2419 movl %edx, %eax
2420# else
2421 movl DEST(%esp), %eax
2422# endif
2423 RETURN
2424
2425 .p2align 4
2426L(fwd_write_43bytes_align):
2427 movdqa -43(%eax), %xmm0
2428 movdqa %xmm0, -43(%edx)
2429L(fwd_write_27bytes_align):
2430 movdqa -27(%eax), %xmm0
2431 movdqa %xmm0, -27(%edx)
2432L(fwd_write_11bytes_align):
2433 movq -11(%eax), %xmm0
2434 movq %xmm0, -11(%edx)
2435L(fwd_write_3bytes_align):
2436 movzwl -3(%eax), %ecx
2437 movzbl -1(%eax), %eax
2438 movw %cx, -3(%edx)
2439 movb %al, -1(%edx)
2440# ifdef USE_AS_MEMPCPY
2441 movl %edx, %eax
2442# else
2443 movl DEST(%esp), %eax
2444# endif
2445 RETURN
2446
2447 .p2align 4
2448L(fwd_write_35bytes_align):
2449 movdqa -35(%eax), %xmm0
2450 movdqa %xmm0, -35(%edx)
2451L(fwd_write_19bytes_align):
2452 movdqa -19(%eax), %xmm0
2453 movdqa %xmm0, -19(%edx)
2454 movzwl -3(%eax), %ecx
2455 movzbl -1(%eax), %eax
2456 movw %cx, -3(%edx)
2457 movb %al, -1(%edx)
2458# ifdef USE_AS_MEMPCPY
2459 movl %edx, %eax
2460# else
2461 movl DEST(%esp), %eax
2462# endif
2463 RETURN
2464
2465 .p2align 4
2466L(fwd_write_44bytes_align):
2467 movdqa -44(%eax), %xmm0
2468 movdqa %xmm0, -44(%edx)
2469L(fwd_write_28bytes_align):
2470 movdqa -28(%eax), %xmm0
2471 movdqa %xmm0, -28(%edx)
2472L(fwd_write_12bytes_align):
2473 movq -12(%eax), %xmm0
2474 movq %xmm0, -12(%edx)
2475L(fwd_write_4bytes_align):
2476 movl -4(%eax), %ecx
2477 movl %ecx, -4(%edx)
2478# ifdef USE_AS_MEMPCPY
2479 movl %edx, %eax
2480# else
2481 movl DEST(%esp), %eax
2482# endif
2483 RETURN
2484
2485 .p2align 4
2486L(fwd_write_36bytes_align):
2487 movdqa -36(%eax), %xmm0
2488 movdqa %xmm0, -36(%edx)
2489L(fwd_write_20bytes_align):
2490 movdqa -20(%eax), %xmm0
2491 movdqa %xmm0, -20(%edx)
2492 movl -4(%eax), %ecx
2493 movl %ecx, -4(%edx)
2494# ifdef USE_AS_MEMPCPY
2495 movl %edx, %eax
2496# else
2497 movl DEST(%esp), %eax
2498# endif
2499 RETURN_END
2500
2501 CFI_PUSH (%edi)
2502
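/* Large copies go through this streaming path: 16-byte blocks are loaded
   with movdqu and written with movntdq (non-temporal stores that bypass
   the cache), so an sfence is required before returning.  %edi is still
   on the stack when this label is reached, hence the CFI_PUSH above and
   the DEST+4 stack offset below.  For memmove, %xmm0 holds 16 bytes saved
   earlier and is written back to the original destination here.  */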
2503 .p2align 4
2504L(large_page):
2505 movdqu (%eax), %xmm1
2506# ifdef USE_AS_MEMMOVE
2507 movl DEST+4(%esp), %edi
2508 movdqu %xmm0, (%edi)
2509# endif
2510 lea 16(%eax), %eax
2511 movntdq %xmm1, (%edx)
2512 lea 16(%edx), %edx
2513 lea -0x90(%ecx), %ecx
2514 POP (%edi)
2515
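/* Main streaming loop: 128 bytes per iteration.  ECX is biased by -0x80
   relative to the bytes still to copy (the lea -0x90 above accounts for
   the 16 bytes already written plus this bias), so the SUB below leaves
   CF clear exactly while at least another 128 bytes remain.  After the
   loop, the CMP $-0x40 / LEA 0x80 pair removes the bias and decides
   whether one more 64-byte block is needed.  */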
2516 .p2align 4
2517L(large_page_loop):
2518 movdqu (%eax), %xmm0
2519 movdqu 0x10(%eax), %xmm1
2520 movdqu 0x20(%eax), %xmm2
2521 movdqu 0x30(%eax), %xmm3
2522 movdqu 0x40(%eax), %xmm4
2523 movdqu 0x50(%eax), %xmm5
2524 movdqu 0x60(%eax), %xmm6
2525 movdqu 0x70(%eax), %xmm7
2526 lea 0x80(%eax), %eax
2527
2528 sub $0x80, %ecx
2529 movntdq %xmm0, (%edx)
2530 movntdq %xmm1, 0x10(%edx)
2531 movntdq %xmm2, 0x20(%edx)
2532 movntdq %xmm3, 0x30(%edx)
2533 movntdq %xmm4, 0x40(%edx)
2534 movntdq %xmm5, 0x50(%edx)
2535 movntdq %xmm6, 0x60(%edx)
2536 movntdq %xmm7, 0x70(%edx)
2537 lea 0x80(%edx), %edx
2538 jae L(large_page_loop)
2539	cmp	$-0x40, %ecx
2540	lea	0x80(%ecx), %ecx
2541 jl L(large_page_less_64bytes)
2542
2543 movdqu (%eax), %xmm0
2544 movdqu 0x10(%eax), %xmm1
2545 movdqu 0x20(%eax), %xmm2
2546 movdqu 0x30(%eax), %xmm3
2547 lea 0x40(%eax), %eax
2548
2549 movntdq %xmm0, (%edx)
2550 movntdq %xmm1, 0x10(%edx)
2551 movntdq %xmm2, 0x20(%edx)
2552 movntdq %xmm3, 0x30(%edx)
2553 lea 0x40(%edx), %edx
2554 sub $0x40, %ecx
2555L(large_page_less_64bytes):
2556 cmp $32, %ecx
2557 jb L(large_page_less_32bytes)
2558 movdqu (%eax), %xmm0
2559 movdqu 0x10(%eax), %xmm1
2560 lea 0x20(%eax), %eax
2561 movntdq %xmm0, (%edx)
2562 movntdq %xmm1, 0x10(%edx)
2563 lea 0x20(%edx), %edx
2564 sub $0x20, %ecx
2565L(large_page_less_32bytes):
2566 add %ecx, %edx
2567 add %ecx, %eax
2568 sfence
2569 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2570
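/* Backward-copy tails.  On entry EAX and EDX point at the lowest source
   and destination byte still to be copied, so these blocks use positive
   offsets and fall through from larger to smaller residues.  The return
   value is the original destination, or destination + length for
   mempcpy.  */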
2571 .p2align 4
2572L(bk_write_44bytes):
2573 movq 36(%eax), %xmm0
2574 movq %xmm0, 36(%edx)
2575L(bk_write_36bytes):
2576 movq 28(%eax), %xmm0
2577 movq %xmm0, 28(%edx)
2578L(bk_write_28bytes):
2579 movq 20(%eax), %xmm0
2580 movq %xmm0, 20(%edx)
2581L(bk_write_20bytes):
2582 movq 12(%eax), %xmm0
2583 movq %xmm0, 12(%edx)
2584L(bk_write_12bytes):
2585 movq 4(%eax), %xmm0
2586 movq %xmm0, 4(%edx)
2587L(bk_write_4bytes):
2588 movl (%eax), %ecx
2589 movl %ecx, (%edx)
2590L(bk_write_0bytes):
2591 movl DEST(%esp), %eax
2592# ifdef USE_AS_MEMPCPY
2593 movl LEN(%esp), %ecx
2594 add %ecx, %eax
2595# endif
2596 RETURN
2597
2598 .p2align 4
2599L(bk_write_40bytes):
2600 movq 32(%eax), %xmm0
2601 movq %xmm0, 32(%edx)
2602L(bk_write_32bytes):
2603 movq 24(%eax), %xmm0
2604 movq %xmm0, 24(%edx)
2605L(bk_write_24bytes):
2606 movq 16(%eax), %xmm0
2607 movq %xmm0, 16(%edx)
2608L(bk_write_16bytes):
2609 movq 8(%eax), %xmm0
2610 movq %xmm0, 8(%edx)
2611L(bk_write_8bytes):
2612 movq (%eax), %xmm0
2613 movq %xmm0, (%edx)
2614 movl DEST(%esp), %eax
2615# ifdef USE_AS_MEMPCPY
2616 movl LEN(%esp), %ecx
2617 add %ecx, %eax
2618# endif
2619 RETURN
2620
2621 .p2align 4
2622L(bk_write_45bytes):
2623 movq 37(%eax), %xmm0
2624 movq %xmm0, 37(%edx)
2625L(bk_write_37bytes):
2626 movq 29(%eax), %xmm0
2627 movq %xmm0, 29(%edx)
2628L(bk_write_29bytes):
2629 movq 21(%eax), %xmm0
2630 movq %xmm0, 21(%edx)
2631L(bk_write_21bytes):
2632 movq 13(%eax), %xmm0
2633 movq %xmm0, 13(%edx)
2634L(bk_write_13bytes):
2635 movq 5(%eax), %xmm0
2636 movq %xmm0, 5(%edx)
2637L(bk_write_5bytes):
2638 movl 1(%eax), %ecx
2639 movl %ecx, 1(%edx)
2640L(bk_write_1bytes):
2641 movzbl (%eax), %ecx
2642 movb %cl, (%edx)
2643 movl DEST(%esp), %eax
2644# ifdef USE_AS_MEMPCPY
2645 movl LEN(%esp), %ecx
2646 add %ecx, %eax
2647# endif
2648 RETURN
2649
2650 .p2align 4
2651L(bk_write_41bytes):
2652 movq 33(%eax), %xmm0
2653 movq %xmm0, 33(%edx)
2654L(bk_write_33bytes):
2655 movq 25(%eax), %xmm0
2656 movq %xmm0, 25(%edx)
2657L(bk_write_25bytes):
2658 movq 17(%eax), %xmm0
2659 movq %xmm0, 17(%edx)
2660L(bk_write_17bytes):
2661 movq 9(%eax), %xmm0
2662 movq %xmm0, 9(%edx)
2663L(bk_write_9bytes):
2664 movq 1(%eax), %xmm0
2665 movq %xmm0, 1(%edx)
2666 movzbl (%eax), %ecx
2667 movb %cl, (%edx)
2668 movl DEST(%esp), %eax
2669# ifdef USE_AS_MEMPCPY
2670 movl LEN(%esp), %ecx
2671 add %ecx, %eax
2672# endif
2673 RETURN
2674
2675 .p2align 4
2676L(bk_write_46bytes):
2677 movq 38(%eax), %xmm0
2678 movq %xmm0, 38(%edx)
2679L(bk_write_38bytes):
2680 movq 30(%eax), %xmm0
2681 movq %xmm0, 30(%edx)
2682L(bk_write_30bytes):
2683 movq 22(%eax), %xmm0
2684 movq %xmm0, 22(%edx)
2685L(bk_write_22bytes):
2686 movq 14(%eax), %xmm0
2687 movq %xmm0, 14(%edx)
2688L(bk_write_14bytes):
2689 movq 6(%eax), %xmm0
2690 movq %xmm0, 6(%edx)
2691L(bk_write_6bytes):
2692 movl 2(%eax), %ecx
2693 movl %ecx, 2(%edx)
2694 movzwl (%eax), %ecx
2695 movw %cx, (%edx)
2696 movl DEST(%esp), %eax
2697# ifdef USE_AS_MEMPCPY
2698 movl LEN(%esp), %ecx
2699 add %ecx, %eax
2700# endif
2701 RETURN
2702
2703 .p2align 4
2704L(bk_write_42bytes):
2705 movq 34(%eax), %xmm0
2706 movq %xmm0, 34(%edx)
2707L(bk_write_34bytes):
2708 movq 26(%eax), %xmm0
2709 movq %xmm0, 26(%edx)
2710L(bk_write_26bytes):
2711 movq 18(%eax), %xmm0
2712 movq %xmm0, 18(%edx)
2713L(bk_write_18bytes):
2714 movq 10(%eax), %xmm0
2715 movq %xmm0, 10(%edx)
2716L(bk_write_10bytes):
2717 movq 2(%eax), %xmm0
2718 movq %xmm0, 2(%edx)
2719L(bk_write_2bytes):
2720 movzwl (%eax), %ecx
2721 movw %cx, (%edx)
2722 movl DEST(%esp), %eax
2723# ifdef USE_AS_MEMPCPY
2724 movl LEN(%esp), %ecx
2725 add %ecx, %eax
2726# endif
2727 RETURN
2728
2729 .p2align 4
2730L(bk_write_47bytes):
2731 movq 39(%eax), %xmm0
2732 movq %xmm0, 39(%edx)
2733L(bk_write_39bytes):
2734 movq 31(%eax), %xmm0
2735 movq %xmm0, 31(%edx)
2736L(bk_write_31bytes):
2737 movq 23(%eax), %xmm0
2738 movq %xmm0, 23(%edx)
2739L(bk_write_23bytes):
2740 movq 15(%eax), %xmm0
2741 movq %xmm0, 15(%edx)
2742L(bk_write_15bytes):
2743 movq 7(%eax), %xmm0
2744 movq %xmm0, 7(%edx)
2745L(bk_write_7bytes):
2746 movl 3(%eax), %ecx
2747 movl %ecx, 3(%edx)
2748 movzwl 1(%eax), %ecx
2749 movw %cx, 1(%edx)
2750 movzbl (%eax), %eax
2751 movb %al, (%edx)
2752 movl DEST(%esp), %eax
2753# ifdef USE_AS_MEMPCPY
2754 movl LEN(%esp), %ecx
2755 add %ecx, %eax
2756# endif
2757 RETURN
2758
2759 .p2align 4
2760L(bk_write_43bytes):
2761 movq 35(%eax), %xmm0
2762 movq %xmm0, 35(%edx)
2763L(bk_write_35bytes):
2764 movq 27(%eax), %xmm0
2765 movq %xmm0, 27(%edx)
2766L(bk_write_27bytes):
2767 movq 19(%eax), %xmm0
2768 movq %xmm0, 19(%edx)
2769L(bk_write_19bytes):
2770 movq 11(%eax), %xmm0
2771 movq %xmm0, 11(%edx)
2772L(bk_write_11bytes):
2773 movq 3(%eax), %xmm0
2774 movq %xmm0, 3(%edx)
2775L(bk_write_3bytes):
2776 movzwl 1(%eax), %ecx
2777 movw %cx, 1(%edx)
2778 movzbl (%eax), %eax
2779 movb %al, (%edx)
2780 movl DEST(%esp), %eax
2781# ifdef USE_AS_MEMPCPY
2782 movl LEN(%esp), %ecx
2783 add %ecx, %eax
2784# endif
2785 RETURN_END
2786
2787
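/* Dispatch tables, kept in .rodata.ssse3: one 4-byte entry per residual
   length 0..47 for the forward, aligned-forward and backward tails, plus
   one entry per shift value 0..15 in L(shl_table) for the palignr-based
   copy loops.  */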
2788 .pushsection .rodata.ssse3,"a",@progbits
2789 .p2align 2
2790L(table_48bytes_fwd):
2791 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
2792 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
2793 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
2794 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
2795 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
2796 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
2797 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
2798 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
2799 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
2800 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
2801 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
2802 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
2803 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
2804 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
2805 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
2806 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
2807 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
2808 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
2809 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
2810 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
2811 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
2812 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
2813 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
2814 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
2815 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
2816 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
2817 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
2818 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
2819 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
2820 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
2821 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
2822 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
2823 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
2824 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
2825 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
2826 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
2827 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
2828 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
2829 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
2830 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
2831 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
2832 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
2833 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
2834 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
2835 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
2836 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
2837 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
2838 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
2839
2840 .p2align 2
2841L(table_48bytes_fwd_align):
2842 .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
2843 .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
2844 .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
2845 .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
2846 .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
2847 .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
2848 .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
2849 .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
2850 .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
2851 .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
2852 .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
2853 .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
2854 .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
2855 .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
2856 .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
2857 .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
2858 .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
2859 .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
2860 .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
2861 .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
2862 .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
2863 .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
2864 .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
2865 .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
2866 .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
2867 .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
2868 .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
2869 .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
2870 .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
2871 .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
2872 .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
2873 .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
2874 .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
2875 .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
2876 .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
2877 .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
2878 .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
2879 .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
2880 .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
2881 .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
2882 .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
2883 .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
2884 .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
2885 .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
2886 .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
2887 .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
2888 .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
2889 .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
2890
2891 .p2align 2
2892L(shl_table):
2893 .int JMPTBL (L(shl_0), L(shl_table))
2894 .int JMPTBL (L(shl_1), L(shl_table))
2895 .int JMPTBL (L(shl_2), L(shl_table))
2896 .int JMPTBL (L(shl_3), L(shl_table))
2897 .int JMPTBL (L(shl_4), L(shl_table))
2898 .int JMPTBL (L(shl_5), L(shl_table))
2899 .int JMPTBL (L(shl_6), L(shl_table))
2900 .int JMPTBL (L(shl_7), L(shl_table))
2901 .int JMPTBL (L(shl_8), L(shl_table))
2902 .int JMPTBL (L(shl_9), L(shl_table))
2903 .int JMPTBL (L(shl_10), L(shl_table))
2904 .int JMPTBL (L(shl_11), L(shl_table))
2905 .int JMPTBL (L(shl_12), L(shl_table))
2906 .int JMPTBL (L(shl_13), L(shl_table))
2907 .int JMPTBL (L(shl_14), L(shl_table))
2908 .int JMPTBL (L(shl_15), L(shl_table))
2909
2910 .p2align 2
2911L(table_48_bytes_bwd):
2912 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
2913 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
2914 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
2915 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
2916 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
2917 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
2918 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
2919 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
2920 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
2921 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
2922 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
2923 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
2924 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
2925 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
2926 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
2927 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
2928 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
2929 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
2930 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
2931 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
2932 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
2933 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
2934 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
2935 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
2936 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
2937 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
2938 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
2939 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
2940 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
2941 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
2942 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
2943 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
2944 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
2945 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
2946 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
2947 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
2948 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
2949 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
2950 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
2951 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
2952 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
2953 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
2954 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
2955 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
2956 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
2957 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
2958 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
2959 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
2960
2961 .popsection
2962
2963# ifdef USE_AS_MEMMOVE
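/* memmove only: the regions overlap with the destination above the
   source, so copy from the highest address downwards.  EDI and EDX are
   set to the end of the source and destination; the destination end is
   brought to 4-byte and then 16-byte alignment before the 64-byte SSSE3
   loop takes over.  */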
2964 .p2align 4
2965L(copy_backward):
2966 PUSH (%edi)
2967 movl %eax, %edi
2968	lea	(%ecx,%edx,1), %edx
2969	lea	(%ecx,%edi,1), %edi
2970 testl $0x3, %edx
2971 jnz L(bk_align)
2972
2973L(bk_aligned_4):
2974 cmp $64, %ecx
2975 jae L(bk_write_more64bytes)
2976
2977L(bk_write_64bytesless):
2978 cmp $32, %ecx
2979 jb L(bk_write_less32bytes)
2980
2981L(bk_write_more32bytes):
2982 /* Copy 32 bytes at a time. */
2983 sub $32, %ecx
2984 movq -8(%edi), %xmm0
2985 movq %xmm0, -8(%edx)
2986 movq -16(%edi), %xmm0
2987 movq %xmm0, -16(%edx)
2988 movq -24(%edi), %xmm0
2989 movq %xmm0, -24(%edx)
2990 movq -32(%edi), %xmm0
2991 movq %xmm0, -32(%edx)
2992 sub $32, %edx
2993 sub $32, %edi
2994
2995L(bk_write_less32bytes):
2996 movl %edi, %eax
2997 sub %ecx, %edx
2998 sub %ecx, %eax
2999 POP (%edi)
3000L(bk_write_less32bytes_2):
3001 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
3002
3003 CFI_PUSH (%edi)
3004
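/* The destination end pointer is not 4-byte aligned.  Unless 8 or fewer
   bytes remain, copy one byte and/or one 16-bit word from the end until
   it is, then rejoin the aligned path.  */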
3005 .p2align 4
3006L(bk_align):
3007 cmp $8, %ecx
3008 jbe L(bk_write_less32bytes)
3009 testl $1, %edx
3010	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
3011	   then (EDX & 2) must be != 0.  */
3012 jz L(bk_got2)
3013 sub $1, %edi
3014 sub $1, %ecx
3015 sub $1, %edx
3016 movzbl (%edi), %eax
3017 movb %al, (%edx)
3018
3019 testl $2, %edx
3020 jz L(bk_aligned_4)
3021
3022L(bk_got2):
3023 sub $2, %edi
3024 sub $2, %ecx
3025 sub $2, %edx
3026 movzwl (%edi), %eax
3027 movw %ax, (%edx)
3028 jmp L(bk_aligned_4)
3029
3030 .p2align 4
3031L(bk_write_more64bytes):
3032	/* Check whether the destination end pointer (EDX) is 16-byte aligned.  */
3033 testl $15, %edx
3034 jz L(bk_ssse3_cpy_pre)
3035
3036/* EDX is 4-byte aligned, but not 16-byte aligned.  */
3037L(bk_ssse3_align):
3038 sub $4, %edi
3039 sub $4, %ecx
3040 sub $4, %edx
3041 movl (%edi), %eax
3042 movl %eax, (%edx)
3043
3044 testl $15, %edx
3045 jz L(bk_ssse3_cpy_pre)
3046
3047 sub $4, %edi
3048 sub $4, %ecx
3049 sub $4, %edx
3050 movl (%edi), %eax
3051 movl %eax, (%edx)
3052
3053 testl $15, %edx
3054 jz L(bk_ssse3_cpy_pre)
3055
3056 sub $4, %edi
3057 sub $4, %ecx
3058 sub $4, %edx
3059 movl (%edi), %eax
3060 movl %eax, (%edx)
3061
3062L(bk_ssse3_cpy_pre):
3063 cmp $64, %ecx
3064 jb L(bk_write_more32bytes)
3065
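/* Main backward loop: copy 64 bytes per iteration, highest 16-byte block
   first, using unaligned loads and 16-byte-aligned stores (the destination
   end pointer was aligned above).  */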
3066 .p2align 4
3067L(bk_ssse3_cpy):
3068 sub $64, %edi
3069 sub $64, %ecx
3070 sub $64, %edx
3071 movdqu 0x30(%edi), %xmm3
3072 movdqa %xmm3, 0x30(%edx)
3073 movdqu 0x20(%edi), %xmm2
3074 movdqa %xmm2, 0x20(%edx)
3075 movdqu 0x10(%edi), %xmm1
3076 movdqa %xmm1, 0x10(%edx)
3077 movdqu (%edi), %xmm0
3078 movdqa %xmm0, (%edx)
3079 cmp $64, %ecx
3080 jae L(bk_ssse3_cpy)
3081 jmp L(bk_write_64bytesless)
3082
3083# endif
3084
3085END (MEMCPY)
3086
3087#endif
3088
