1/* memcpy with SSSE3 and REP string.
2 Copyright (C) 2010-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
21#if IS_IN (libc) \
22 && (defined SHARED \
23 || defined USE_AS_MEMMOVE \
24 || !defined USE_MULTIARCH)
25
26#include "asm-syntax.h"
27
/* Default entry-point names, used only when MEMCPY has not already
   been defined before this point.  */
28#ifndef MEMCPY
29# define MEMCPY __memcpy_ssse3_rep
30# define MEMCPY_CHK __memcpy_chk_ssse3_rep
31#endif
32
/* Offsets from ESP of the three stack arguments once ENTRANCE has run.
   PARMS is defined further down: 8 for PIC (EBX is pushed on entry),
   4 otherwise.  */
33#define DEST PARMS
34#define SRC DEST+4
35#define LEN SRC+4
36
/* Unwind annotations that go with a 4-byte push of REG: the CFA offset
   is adjusted by +4 and REG is registered through cfi_rel_offset at
   offset 0.  */
37#define CFI_PUSH(REG) \
38 cfi_adjust_cfa_offset (4); \
39 cfi_rel_offset (REG, 0)
40
/* Unwind annotations that go with a 4-byte pop of REG: the CFA offset
   is adjusted by -4 and REG is marked through cfi_restore.  */
41#define CFI_POP(REG) \
42 cfi_adjust_cfa_offset (-4); \
43 cfi_restore (REG)
44
/* pushl/popl of REG together with the matching unwind annotations.  */
45#define PUSH(REG) pushl REG; CFI_PUSH (REG)
46#define POP(REG) popl REG; CFI_POP (REG)
47
48#ifdef PIC
/* PIC build: EBX is pushed on entry because the macros below use it as
   the PC / jump-table base register, so the arguments start at
   8(%esp).  */
49# define PARMS 8 /* Preserve EBX. */
50# define ENTRANCE PUSH (%ebx);
51# define RETURN_END POP (%ebx); ret
/* RETURN is RETURN_END followed by CFI_PUSH (%ebx): after the ret the
   unwind state again describes EBX as pushed, for the code that comes
   next in the file.  */
52# define RETURN RETURN_END; CFI_PUSH (%ebx)
/* A jump-table entry is the distance of label I from the table base B.  */
53# define JMPTBL(I, B) I - B
54
55/* Load an entry in a jump table into EBX and branch to it. TABLE is a
56 jump table with relative offsets. INDEX is a register contains the
57 index into the jump table. SCALE is the scale of INDEX. */
58# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
59 /* We first load PC into EBX. */ \
60 SETUP_PIC_REG(bx); \
61 /* Get the address of the jump table. */ \
62 addl $(TABLE - .), %ebx; \
63 /* Get the entry and convert the relative offset to the \
64 absolute address. */ \
65 addl (%ebx,INDEX,SCALE), %ebx; \
66 /* We loaded the jump table. Go. */ \
67 jmp *%ebx
68
/* Rebase EBX onto TABLE by adding (TABLE - .).  No PC load is done
   here, so EBX must already hold the address at which this macro is
   expanded -- which is the case when that spot was reached through the
   `jmp *%ebx' of BRANCH_TO_JMPTBL_ENTRY.  */
69# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
70 addl $(TABLE - .), %ebx
71
/* Finish a dispatch whose table base is already in EBX (see
   BRANCH_TO_JMPTBL_ENTRY_VALUE): add the selected relative entry and
   jump to it.  TABLE itself is not referenced in this variant.  */
72# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
73 addl (%ebx,INDEX,SCALE), %ebx; \
74 /* We loaded the jump table. Go. */ \
75 jmp *%ebx
76#else
/* Non-PIC build: nothing is pushed on entry, so the arguments start at
   4(%esp), and jump-table entries are the labels themselves.  */
77# define PARMS 4
78# define ENTRANCE
79# define RETURN_END ret
80# define RETURN RETURN_END
81# define JMPTBL(I, B) I
82
83/* Branch to an entry in a jump table. TABLE is a jump table with
84 absolute offsets. INDEX is a register contains the index into the
85 jump table. SCALE is the scale of INDEX. */
86# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
87 jmp *TABLE(,INDEX,SCALE)
88
/* No base register is involved with absolute entries, so this expands
   to nothing.  */
89# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
90
/* Identical to BRANCH_TO_JMPTBL_ENTRY in the non-PIC build.  */
91# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
92 jmp *TABLE(,INDEX,SCALE)
93#endif
94
95 .section .text.ssse3,"ax",@progbits
96#ifdef SHARED
/* Checked entry point, built only when SHARED is defined.  It loads the
   third stack argument, 12(%esp) -- the copy length, i.e. the slot
   MEMCPY reads as LEN -- and compares the fourth argument, 16(%esp),
   against it.  If the fourth is below the third (unsigned) control
   goes to __chk_fail; otherwise execution falls through into MEMCPY,
   which starts right after END.
   NOTE(review): 16(%esp) is presumably the size of the destination
   object -- confirm against the caller's prototype.  */
97ENTRY (MEMCPY_CHK)
98 movl 12(%esp), %eax
99 cmpl %eax, 16(%esp)
100 jb HIDDEN_JUMPTARGET (__chk_fail)
101END (MEMCPY_CHK)
102#endif
/* Main entry.  Loads the three stack arguments and either dispatches a
   short copy (fewer than 48 bytes) through a jump table or continues at
   L(48bytesormore).
   Register roles from here on: ECX = length, EAX = source,
   EDX = destination.  Under PIC, ENTRANCE has pushed EBX.  */
103ENTRY (MEMCPY)
104 ENTRANCE
105 movl LEN(%esp), %ecx
106 movl SRC(%esp), %eax
107 movl DEST(%esp), %edx
108
109#ifdef USE_AS_MEMMOVE
 /* memmove build only: choose a copy direction.  */
110 cmp %eax, %edx
 /* Destination below source: copy forward.  */
111 jb L(copy_forward)
 /* Destination equals source: go straight to the code that loads the
    return value.  */
112 je L(fwd_write_0bytes)
113 cmp $48, %ecx
114 jb L(bk_write_less48bytes)
 /* EAX = source + length, needed only for the compare.  The movl that
    reloads EAX does not change the flags, so the jb still tests
    "destination < source + length", i.e. the two regions overlap with
    the destination above the source.  */
115 add %ecx, %eax
116 cmp %eax, %edx
117 movl SRC(%esp), %eax
118 jb L(copy_backward)
119
120L(copy_forward):
121#endif
122 cmp $48, %ecx
123 jae L(48bytesormore)
124
 /* Reached with ECX < 48, whatever the label name suggests.  */
125L(fwd_write_less32bytes):
126#ifndef USE_AS_MEMMOVE
 /* Only the low bytes of the two pointers are compared: when the
    source's is below the destination's, dispatch through the backward
    table instead.  */
127 cmp %dl, %al
128 jb L(bk_write)
129#endif
 /* Point both registers one past the end of the data, then dispatch on
    the length.  L(table_48bytes_fwd) is defined outside this part of
    the file; presumably it selects the L(fwd_write_Nbytes) entries,
    which address the data with negative offsets.  */
130 add %ecx, %edx
131 add %ecx, %eax
132 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
133#ifndef USE_AS_MEMMOVE
 /* Backward dispatch: EAX/EDX are left at the start of the data.  */
134L(bk_write):
135 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
136#endif
137
138 ALIGN (4)
139/* ECX >= 48 on entry (see the `cmp $48' / `jae' that leads here).  EDX
   is not assumed to be aligned: the first 16 source bytes are fetched
   unaligned into XMM0 for a later store at the original destination,
   EDX is moved up to the next 16-byte boundary, and EAX / ECX are
   adjusted by the same number of head bytes.  EDI and ESI are pushed
   here; the unwind state is remembered for the code paths that
   follow.  */
140L(48bytesormore):
141 movdqu (%eax), %xmm0
142 PUSH (%edi)
143 movl %edx, %edi
144 and $-16, %edx
145 PUSH (%esi)
146 cfi_remember_state
 /* EDX = first 16-byte boundary strictly above the destination.  */
147 add $16, %edx
 /* ESI keeps the original destination for the deferred XMM0 store.  */
148 movl %edi, %esi
 /* EDI = destination - aligned destination, i.e. minus the number of
    head bytes; applying it shrinks ECX and advances EAX.  */
149 sub %edx, %edi
150 add %edi, %ecx
151 sub %edi, %eax
152
 /* Compare the remaining length with SHARED_CACHE_SIZE_HALF, or with
    the variable __x86_shared_cache_size_half when that macro is not
    defined.  Under PIC, EBX is first made to point at the GOT for the
    @GOTOFF access.  */
153#ifdef SHARED_CACHE_SIZE_HALF
154 cmp $SHARED_CACHE_SIZE_HALF, %ecx
155#else
156# ifdef PIC
157 SETUP_PIC_REG(bx)
158 add $_GLOBAL_OFFSET_TABLE_, %ebx
159 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
160# else
161 cmp __x86_shared_cache_size_half, %ecx
162# endif
163#endif
164
 /* mov leaves the flags alone, so the jae still tests the compare
    above: lengths at or above the limit go to L(large_page).  */
165 mov %eax, %edi
166 jae L(large_page)
 /* EDI = low four bits of the adjusted source address; zero means the
    source is 16-byte aligned as well.  */
167 and $0xf, %edi
168 jz L(shl_0)
169
 /* Otherwise dispatch on that value through L(shl_table), which is
    defined outside this part of the file.  */
170 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
171
172 ALIGN (4)
/* Source and destination are both 16-byte aligned (EAX & 15 was zero).
   In: EAX = source, EDX = aligned destination, ECX = bytes left,
   ESI = original destination, XMM0 = first 16 source bytes; ESI and EDI
   are on the stack.  Up to 127 bytes are handled here in 32-byte steps;
   anything longer goes to L(shl_0_gobble).  */
173L(shl_0):
 /* Deferred store of the unaligned head.  */
174 movdqu %xmm0, (%esi)
 /* EDI becomes the running byte index.  */
175 xor %edi, %edi
176 cmp $127, %ecx
177 ja L(shl_0_gobble)
 /* Bias ECX by -32 so that each step can leave on the borrow of its own
    `sub $32'.  */
178 lea -32(%ecx), %ecx
 /* Four unrolled 32-byte steps.  In each, the flags written by `sub'
    are still intact at the jb because the SSE moves and lea do not
    modify them.  The last step has no branch and runs into
    L(shl_0_end).
    NOTE(review): with ECX <= 127 on entry the borrow appears to occur
    no later than the third step, which would make the fourth step
    unreachable -- confirm before relying on it.  */
179L(shl_0_loop):
180 movdqa (%eax, %edi), %xmm0
181 movdqa 16(%eax, %edi), %xmm1
182 sub $32, %ecx
183 movdqa %xmm0, (%edx, %edi)
184 movdqa %xmm1, 16(%edx, %edi)
185 lea 32(%edi), %edi
186 jb L(shl_0_end)
187
188 movdqa (%eax, %edi), %xmm0
189 movdqa 16(%eax, %edi), %xmm1
190 sub $32, %ecx
191 movdqa %xmm0, (%edx, %edi)
192 movdqa %xmm1, 16(%edx, %edi)
193 lea 32(%edi), %edi
194 jb L(shl_0_end)
195
196 movdqa (%eax, %edi), %xmm0
197 movdqa 16(%eax, %edi), %xmm1
198 sub $32, %ecx
199 movdqa %xmm0, (%edx, %edi)
200 movdqa %xmm1, 16(%edx, %edi)
201 lea 32(%edi), %edi
202 jb L(shl_0_end)
203
204 movdqa (%eax, %edi), %xmm0
205 movdqa 16(%eax, %edi), %xmm1
206 sub $32, %ecx
207 movdqa %xmm0, (%edx, %edi)
208 movdqa %xmm1, 16(%edx, %edi)
209 lea 32(%edi), %edi
210L(shl_0_end):
 /* Undo the bias: ECX = bytes still to copy (below 32).  EDI + ECX is
    added to both pointers so they end up one past the end of the data,
    then the remainder is dispatched on.  */
211 lea 32(%ecx), %ecx
212 add %ecx, %edi
213 add %edi, %edx
214 add %edi, %eax
215 POP (%esi)
216 POP (%edi)
217 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
218
219 cfi_restore_state
220 cfi_remember_state
/* Aligned copy with more than 127 bytes left (reached from L(shl_0)).
   Chooses between the plain 128-byte loop that follows and
   L(shl_0_gobble_mem_start): EDI is loaded with the data-cache-half
   size, one eighth of it is subtracted (EDI - EDI / 8), and lengths at
   or above that threshold go to L(shl_0_gobble_mem_start).
   ESI and EDI are used as scratch; both are still saved on the
   stack.  */
221L(shl_0_gobble):
222
223#ifdef DATA_CACHE_SIZE_HALF
 /* Load the constant into EDI, as the other two branches do with the
    variable.  A `cmp' against ECX at this point would be useless: its
    flags are overwritten by the arithmetic below, and EDI -- zeroed in
    L(shl_0) -- would yield a threshold of 0, sending every length to
    L(shl_0_gobble_mem_start).  */
224 mov $DATA_CACHE_SIZE_HALF, %edi
225#else
226# ifdef PIC
 /* EBX = GOT address for the @GOTOFF access.  */
227 SETUP_PIC_REG(bx)
228 add $_GLOBAL_OFFSET_TABLE_, %ebx
229 mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi
230# else
231 mov __x86_data_cache_size_half, %edi
232# endif
233#endif
 /* EDI = size - size / 8.  */
234 mov %edi, %esi
235 shr $3, %esi
236 sub %esi, %edi
237 cmp %edi, %ecx
238 jae L(shl_0_gobble_mem_start)
 /* Bias ECX by -128 for the loop that follows, which leaves on the
    borrow of its own `sub $128'.  */
239 sub $128, %ecx
240 ALIGN (4)
/* Aligned copy, 128 bytes per iteration, through XMM0-XMM7 with aligned
   loads and stores.  ECX arrives already reduced by 128 (the `sub $128'
   immediately before this loop); the loop repeats while its own
   `sub $128' does not borrow.  The flags from that `sub' are still
   valid at the jae because the SSE moves and lea do not modify
   them.  */
241L(shl_0_gobble_cache_loop):
242 movdqa (%eax), %xmm0
243 movaps 0x10(%eax), %xmm1
244 movaps 0x20(%eax), %xmm2
245 movaps 0x30(%eax), %xmm3
246 movaps 0x40(%eax), %xmm4
247 movaps 0x50(%eax), %xmm5
248 movaps 0x60(%eax), %xmm6
249 movaps 0x70(%eax), %xmm7
250 lea 0x80(%eax), %eax
251 sub $128, %ecx
252 movdqa %xmm0, (%edx)
253 movaps %xmm1, 0x10(%edx)
254 movaps %xmm2, 0x20(%edx)
255 movaps %xmm3, 0x30(%edx)
256 movaps %xmm4, 0x40(%edx)
257 movaps %xmm5, 0x50(%edx)
258 movaps %xmm6, 0x60(%edx)
259 movaps %xmm7, 0x70(%edx)
260 lea 0x80(%edx), %edx
261
262 jae L(shl_0_gobble_cache_loop)
 /* Undo the bias: ECX = bytes left, below 128.  Peel off one 64-byte,
    one 32-byte and one 16-byte piece as far as ECX allows.  */
263 add $0x80, %ecx
264 cmp $0x40, %ecx
265 jb L(shl_0_cache_less_64bytes)
266
267 movdqa (%eax), %xmm0
268 sub $0x40, %ecx
269 movdqa 0x10(%eax), %xmm1
270
271 movdqa %xmm0, (%edx)
272 movdqa %xmm1, 0x10(%edx)
273
274 movdqa 0x20(%eax), %xmm0
275 movdqa 0x30(%eax), %xmm1
276 add $0x40, %eax
277
278 movdqa %xmm0, 0x20(%edx)
279 movdqa %xmm1, 0x30(%edx)
280 add $0x40, %edx
281L(shl_0_cache_less_64bytes):
282 cmp $0x20, %ecx
283 jb L(shl_0_cache_less_32bytes)
284 movdqa (%eax), %xmm0
285 sub $0x20, %ecx
286 movdqa 0x10(%eax), %xmm1
287 add $0x20, %eax
288 movdqa %xmm0, (%edx)
289 movdqa %xmm1, 0x10(%edx)
290 add $0x20, %edx
291L(shl_0_cache_less_32bytes):
292 cmp $0x10, %ecx
293 jb L(shl_0_cache_less_16bytes)
294 sub $0x10, %ecx
295 movdqa (%eax), %xmm0
296 add $0x10, %eax
297 movdqa %xmm0, (%edx)
298 add $0x10, %edx
299L(shl_0_cache_less_16bytes):
 /* Fewer than 16 bytes remain.  Move both pointers one past the end of
    the data, restore ESI / EDI and dispatch on the remainder.  */
300 add %ecx, %edx
301 add %ecx, %eax
302 POP (%esi)
303 POP (%edi)
304 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
305
306 cfi_restore_state
307 cfi_remember_state
308 ALIGN (4)
/* Aligned copy for lengths at or above the threshold computed in
   L(shl_0_gobble).  ESI and EDI are still on the stack (the unwind
   state remembered in L(48bytesormore) is restored above).  */
309L(shl_0_gobble_mem_start):
 /* When the low bytes of source and destination are equal, hand over to
    the `rep movsl' path.  */
310 cmp %al, %dl
311 je L(copy_page_by_rep)
 /* Bias ECX by -128; the loop leaves on the borrow of its own sub.  */
312 sub $128, %ecx
 /* 128 bytes per iteration with aligned loads and stores, plus
    prefetchnta hints at +0x1c0 and +0x280 on both the source and the
    destination.  The flags from `sub $0x80' survive until the jae
    because the SSE moves and lea do not modify them.  */
313L(shl_0_gobble_mem_loop):
314 prefetchnta 0x1c0(%eax)
315 prefetchnta 0x280(%eax)
316 prefetchnta 0x1c0(%edx)
317 prefetchnta 0x280(%edx)
318
319 movdqa (%eax), %xmm0
320 movaps 0x10(%eax), %xmm1
321 movaps 0x20(%eax), %xmm2
322 movaps 0x30(%eax), %xmm3
323 movaps 0x40(%eax), %xmm4
324 movaps 0x50(%eax), %xmm5
325 movaps 0x60(%eax), %xmm6
326 movaps 0x70(%eax), %xmm7
327 lea 0x80(%eax), %eax
328 sub $0x80, %ecx
329 movdqa %xmm0, (%edx)
330 movaps %xmm1, 0x10(%edx)
331 movaps %xmm2, 0x20(%edx)
332 movaps %xmm3, 0x30(%edx)
333 movaps %xmm4, 0x40(%edx)
334 movaps %xmm5, 0x50(%edx)
335 movaps %xmm6, 0x60(%edx)
336 movaps %xmm7, 0x70(%edx)
337 lea 0x80(%edx), %edx
338
339 jae L(shl_0_gobble_mem_loop)
 /* Undo the bias: ECX = bytes left, below 128.  Peel off one 64-byte,
    one 32-byte and one 16-byte piece as far as ECX allows.  */
340 add $0x80, %ecx
341 cmp $0x40, %ecx
342 jb L(shl_0_mem_less_64bytes)
343
344 movdqa (%eax), %xmm0
345 sub $0x40, %ecx
346 movdqa 0x10(%eax), %xmm1
347
348 movdqa %xmm0, (%edx)
349 movdqa %xmm1, 0x10(%edx)
350
351 movdqa 0x20(%eax), %xmm0
352 movdqa 0x30(%eax), %xmm1
353 add $0x40, %eax
354
355 movdqa %xmm0, 0x20(%edx)
356 movdqa %xmm1, 0x30(%edx)
357 add $0x40, %edx
358L(shl_0_mem_less_64bytes):
359 cmp $0x20, %ecx
360 jb L(shl_0_mem_less_32bytes)
361 movdqa (%eax), %xmm0
362 sub $0x20, %ecx
363 movdqa 0x10(%eax), %xmm1
364 add $0x20, %eax
365 movdqa %xmm0, (%edx)
366 movdqa %xmm1, 0x10(%edx)
367 add $0x20, %edx
368L(shl_0_mem_less_32bytes):
369 cmp $0x10, %ecx
370 jb L(shl_0_mem_less_16bytes)
371 sub $0x10, %ecx
372 movdqa (%eax), %xmm0
373 add $0x10, %eax
374 movdqa %xmm0, (%edx)
375 add $0x10, %edx
376L(shl_0_mem_less_16bytes):
 /* Fewer than 16 bytes remain.  Move both pointers one past the end of
    the data, restore ESI / EDI and dispatch on the remainder.  */
377 add %ecx, %edx
378 add %ecx, %eax
379 POP (%esi)
380 POP (%edi)
381 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
382
383 cfi_restore_state
384 cfi_remember_state
385 ALIGN (4)
/* Forward copy using palignr with a shift of 1 byte -- presumably entry
   1 of L(shl_table) (defined outside this part of the file), i.e. the
   source is 1 byte past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
386L(shl_1):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
387 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 1 byte (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
388 sub $1, %eax
389 movaps (%eax), %xmm1
390 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
391 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
392 movdqu %xmm0, (%esi)
393 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
394L(shl_1_loop):
395
396 movdqa 16(%eax, %edi), %xmm2
397 sub $32, %ecx
398 movdqa 32(%eax, %edi), %xmm3
399 movdqa %xmm3, %xmm4
400 palignr $1, %xmm2, %xmm3
401 palignr $1, %xmm1, %xmm2
402 lea 32(%edi), %edi
403 movdqa %xmm2, -32(%edx, %edi)
404 movdqa %xmm3, -16(%edx, %edi)
405
406 jb L(shl_1_end)
407
408 movdqa 16(%eax, %edi), %xmm2
409 sub $32, %ecx
410 movdqa 32(%eax, %edi), %xmm3
411 movdqa %xmm3, %xmm1
412 palignr $1, %xmm2, %xmm3
413 palignr $1, %xmm4, %xmm2
414 lea 32(%edi), %edi
415 movdqa %xmm2, -32(%edx, %edi)
416 movdqa %xmm3, -16(%edx, %edi)
417
418 jae L(shl_1_loop)
419
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +1 in the lea undoes the
    earlier step back of EAX.  */
420L(shl_1_end):
421 add $32, %ecx
422 add %ecx, %edi
423 add %edi, %edx
424 lea 1(%edi, %eax), %eax
425 POP (%edi)
426 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
427
428 cfi_restore_state
429 cfi_remember_state
430 ALIGN (4)
/* Forward copy using palignr with a shift of 2 bytes -- presumably
   entry 2 of L(shl_table) (defined outside this part of the file), i.e.
   the source is 2 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
431L(shl_2):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
432 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 2 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
433 sub $2, %eax
434 movaps (%eax), %xmm1
435 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
436 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
437 movdqu %xmm0, (%esi)
438 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
439L(shl_2_loop):
440
441 movdqa 16(%eax, %edi), %xmm2
442 sub $32, %ecx
443 movdqa 32(%eax, %edi), %xmm3
444 movdqa %xmm3, %xmm4
445 palignr $2, %xmm2, %xmm3
446 palignr $2, %xmm1, %xmm2
447 lea 32(%edi), %edi
448 movdqa %xmm2, -32(%edx, %edi)
449 movdqa %xmm3, -16(%edx, %edi)
450
451 jb L(shl_2_end)
452
453 movdqa 16(%eax, %edi), %xmm2
454 sub $32, %ecx
455 movdqa 32(%eax, %edi), %xmm3
456 movdqa %xmm3, %xmm1
457 palignr $2, %xmm2, %xmm3
458 palignr $2, %xmm4, %xmm2
459 lea 32(%edi), %edi
460 movdqa %xmm2, -32(%edx, %edi)
461 movdqa %xmm3, -16(%edx, %edi)
462
463 jae L(shl_2_loop)
464
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +2 in the lea undoes the
    earlier step back of EAX.  */
465L(shl_2_end):
466 add $32, %ecx
467 add %ecx, %edi
468 add %edi, %edx
469 lea 2(%edi, %eax), %eax
470 POP (%edi)
471 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
472
473 cfi_restore_state
474 cfi_remember_state
475 ALIGN (4)
/* Forward copy using palignr with a shift of 3 bytes -- presumably
   entry 3 of L(shl_table) (defined outside this part of the file), i.e.
   the source is 3 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
476L(shl_3):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
477 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 3 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
478 sub $3, %eax
479 movaps (%eax), %xmm1
480 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
481 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
482 movdqu %xmm0, (%esi)
483 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
484L(shl_3_loop):
485
486 movdqa 16(%eax, %edi), %xmm2
487 sub $32, %ecx
488 movdqa 32(%eax, %edi), %xmm3
489 movdqa %xmm3, %xmm4
490 palignr $3, %xmm2, %xmm3
491 palignr $3, %xmm1, %xmm2
492 lea 32(%edi), %edi
493 movdqa %xmm2, -32(%edx, %edi)
494 movdqa %xmm3, -16(%edx, %edi)
495
496 jb L(shl_3_end)
497
498 movdqa 16(%eax, %edi), %xmm2
499 sub $32, %ecx
500 movdqa 32(%eax, %edi), %xmm3
501 movdqa %xmm3, %xmm1
502 palignr $3, %xmm2, %xmm3
503 palignr $3, %xmm4, %xmm2
504 lea 32(%edi), %edi
505 movdqa %xmm2, -32(%edx, %edi)
506 movdqa %xmm3, -16(%edx, %edi)
507
508 jae L(shl_3_loop)
509
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +3 in the lea undoes the
    earlier step back of EAX.  */
510L(shl_3_end):
511 add $32, %ecx
512 add %ecx, %edi
513 add %edi, %edx
514 lea 3(%edi, %eax), %eax
515 POP (%edi)
516 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
517
518 cfi_restore_state
519 cfi_remember_state
520 ALIGN (4)
/* Forward copy using palignr with a shift of 4 bytes -- presumably
   entry 4 of L(shl_table) (defined outside this part of the file), i.e.
   the source is 4 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
521L(shl_4):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
522 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 4 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
523 sub $4, %eax
524 movaps (%eax), %xmm1
525 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
526 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
527 movdqu %xmm0, (%esi)
528 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
529L(shl_4_loop):
530
531 movdqa 16(%eax, %edi), %xmm2
532 sub $32, %ecx
533 movdqa 32(%eax, %edi), %xmm3
534 movdqa %xmm3, %xmm4
535 palignr $4, %xmm2, %xmm3
536 palignr $4, %xmm1, %xmm2
537 lea 32(%edi), %edi
538 movdqa %xmm2, -32(%edx, %edi)
539 movdqa %xmm3, -16(%edx, %edi)
540
541 jb L(shl_4_end)
542
543 movdqa 16(%eax, %edi), %xmm2
544 sub $32, %ecx
545 movdqa 32(%eax, %edi), %xmm3
546 movdqa %xmm3, %xmm1
547 palignr $4, %xmm2, %xmm3
548 palignr $4, %xmm4, %xmm2
549 lea 32(%edi), %edi
550 movdqa %xmm2, -32(%edx, %edi)
551 movdqa %xmm3, -16(%edx, %edi)
552
553 jae L(shl_4_loop)
554
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +4 in the lea undoes the
    earlier step back of EAX.  */
555L(shl_4_end):
556 add $32, %ecx
557 add %ecx, %edi
558 add %edi, %edx
559 lea 4(%edi, %eax), %eax
560 POP (%edi)
561 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
562
563 cfi_restore_state
564 cfi_remember_state
565 ALIGN (4)
/* Forward copy using palignr with a shift of 5 bytes -- presumably
   entry 5 of L(shl_table) (defined outside this part of the file), i.e.
   the source is 5 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
566L(shl_5):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
567 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 5 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
568 sub $5, %eax
569 movaps (%eax), %xmm1
570 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
571 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
572 movdqu %xmm0, (%esi)
573 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
574L(shl_5_loop):
575
576 movdqa 16(%eax, %edi), %xmm2
577 sub $32, %ecx
578 movdqa 32(%eax, %edi), %xmm3
579 movdqa %xmm3, %xmm4
580 palignr $5, %xmm2, %xmm3
581 palignr $5, %xmm1, %xmm2
582 lea 32(%edi), %edi
583 movdqa %xmm2, -32(%edx, %edi)
584 movdqa %xmm3, -16(%edx, %edi)
585
586 jb L(shl_5_end)
587
588 movdqa 16(%eax, %edi), %xmm2
589 sub $32, %ecx
590 movdqa 32(%eax, %edi), %xmm3
591 movdqa %xmm3, %xmm1
592 palignr $5, %xmm2, %xmm3
593 palignr $5, %xmm4, %xmm2
594 lea 32(%edi), %edi
595 movdqa %xmm2, -32(%edx, %edi)
596 movdqa %xmm3, -16(%edx, %edi)
597
598 jae L(shl_5_loop)
599
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +5 in the lea undoes the
    earlier step back of EAX.  */
600L(shl_5_end):
601 add $32, %ecx
602 add %ecx, %edi
603 add %edi, %edx
604 lea 5(%edi, %eax), %eax
605 POP (%edi)
606 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
607
608 cfi_restore_state
609 cfi_remember_state
610 ALIGN (4)
/* Forward copy using palignr with a shift of 6 bytes -- presumably
   entry 6 of L(shl_table) (defined outside this part of the file), i.e.
   the source is 6 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
611L(shl_6):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
612 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 6 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
613 sub $6, %eax
614 movaps (%eax), %xmm1
615 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
616 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
617 movdqu %xmm0, (%esi)
618 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
619L(shl_6_loop):
620
621 movdqa 16(%eax, %edi), %xmm2
622 sub $32, %ecx
623 movdqa 32(%eax, %edi), %xmm3
624 movdqa %xmm3, %xmm4
625 palignr $6, %xmm2, %xmm3
626 palignr $6, %xmm1, %xmm2
627 lea 32(%edi), %edi
628 movdqa %xmm2, -32(%edx, %edi)
629 movdqa %xmm3, -16(%edx, %edi)
630
631 jb L(shl_6_end)
632
633 movdqa 16(%eax, %edi), %xmm2
634 sub $32, %ecx
635 movdqa 32(%eax, %edi), %xmm3
636 movdqa %xmm3, %xmm1
637 palignr $6, %xmm2, %xmm3
638 palignr $6, %xmm4, %xmm2
639 lea 32(%edi), %edi
640 movdqa %xmm2, -32(%edx, %edi)
641 movdqa %xmm3, -16(%edx, %edi)
642
643 jae L(shl_6_loop)
644
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +6 in the lea undoes the
    earlier step back of EAX.  */
645L(shl_6_end):
646 add $32, %ecx
647 add %ecx, %edi
648 add %edi, %edx
649 lea 6(%edi, %eax), %eax
650 POP (%edi)
651 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
652
653 cfi_restore_state
654 cfi_remember_state
655 ALIGN (4)
/* Forward copy using palignr with a shift of 7 bytes -- presumably
   entry 7 of L(shl_table) (defined outside this part of the file), i.e.
   the source is 7 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
656L(shl_7):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
657 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 7 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
658 sub $7, %eax
659 movaps (%eax), %xmm1
660 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
661 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
662 movdqu %xmm0, (%esi)
663 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
664L(shl_7_loop):
665
666 movdqa 16(%eax, %edi), %xmm2
667 sub $32, %ecx
668 movdqa 32(%eax, %edi), %xmm3
669 movdqa %xmm3, %xmm4
670 palignr $7, %xmm2, %xmm3
671 palignr $7, %xmm1, %xmm2
672 lea 32(%edi), %edi
673 movdqa %xmm2, -32(%edx, %edi)
674 movdqa %xmm3, -16(%edx, %edi)
675
676 jb L(shl_7_end)
677
678 movdqa 16(%eax, %edi), %xmm2
679 sub $32, %ecx
680 movdqa 32(%eax, %edi), %xmm3
681 movdqa %xmm3, %xmm1
682 palignr $7, %xmm2, %xmm3
683 palignr $7, %xmm4, %xmm2
684 lea 32(%edi), %edi
685 movdqa %xmm2, -32(%edx, %edi)
686 movdqa %xmm3, -16(%edx, %edi)
687
688 jae L(shl_7_loop)
689
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +7 in the lea undoes the
    earlier step back of EAX.  */
690L(shl_7_end):
691 add $32, %ecx
692 add %ecx, %edi
693 add %edi, %edx
694 lea 7(%edi, %eax), %eax
695 POP (%edi)
696 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
697
698 cfi_restore_state
699 cfi_remember_state
700 ALIGN (4)
/* Forward copy using palignr with a shift of 8 bytes -- presumably
   entry 8 of L(shl_table) (defined outside this part of the file), i.e.
   the source is 8 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
701L(shl_8):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
702 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 8 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
703 sub $8, %eax
704 movaps (%eax), %xmm1
705 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
706 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
707 movdqu %xmm0, (%esi)
708 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
709L(shl_8_loop):
710
711 movdqa 16(%eax, %edi), %xmm2
712 sub $32, %ecx
713 movdqa 32(%eax, %edi), %xmm3
714 movdqa %xmm3, %xmm4
715 palignr $8, %xmm2, %xmm3
716 palignr $8, %xmm1, %xmm2
717 lea 32(%edi), %edi
718 movdqa %xmm2, -32(%edx, %edi)
719 movdqa %xmm3, -16(%edx, %edi)
720
721 jb L(shl_8_end)
722
723 movdqa 16(%eax, %edi), %xmm2
724 sub $32, %ecx
725 movdqa 32(%eax, %edi), %xmm3
726 movdqa %xmm3, %xmm1
727 palignr $8, %xmm2, %xmm3
728 palignr $8, %xmm4, %xmm2
729 lea 32(%edi), %edi
730 movdqa %xmm2, -32(%edx, %edi)
731 movdqa %xmm3, -16(%edx, %edi)
732
733 jae L(shl_8_loop)
734
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +8 in the lea undoes the
    earlier step back of EAX.  */
735L(shl_8_end):
736 add $32, %ecx
737 add %ecx, %edi
738 add %edi, %edx
739 lea 8(%edi, %eax), %eax
740 POP (%edi)
741 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
742
743 cfi_restore_state
744 cfi_remember_state
745 ALIGN (4)
/* Forward copy using palignr with a shift of 9 bytes -- presumably
   entry 9 of L(shl_table) (defined outside this part of the file), i.e.
   the source is 9 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
746L(shl_9):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
747 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 9 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
748 sub $9, %eax
749 movaps (%eax), %xmm1
750 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
751 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
752 movdqu %xmm0, (%esi)
753 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
754L(shl_9_loop):
755
756 movdqa 16(%eax, %edi), %xmm2
757 sub $32, %ecx
758 movdqa 32(%eax, %edi), %xmm3
759 movdqa %xmm3, %xmm4
760 palignr $9, %xmm2, %xmm3
761 palignr $9, %xmm1, %xmm2
762 lea 32(%edi), %edi
763 movdqa %xmm2, -32(%edx, %edi)
764 movdqa %xmm3, -16(%edx, %edi)
765
766 jb L(shl_9_end)
767
768 movdqa 16(%eax, %edi), %xmm2
769 sub $32, %ecx
770 movdqa 32(%eax, %edi), %xmm3
771 movdqa %xmm3, %xmm1
772 palignr $9, %xmm2, %xmm3
773 palignr $9, %xmm4, %xmm2
774 lea 32(%edi), %edi
775 movdqa %xmm2, -32(%edx, %edi)
776 movdqa %xmm3, -16(%edx, %edi)
777
778 jae L(shl_9_loop)
779
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +9 in the lea undoes the
    earlier step back of EAX.  */
780L(shl_9_end):
781 add $32, %ecx
782 add %ecx, %edi
783 add %edi, %edx
784 lea 9(%edi, %eax), %eax
785 POP (%edi)
786 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
787
788 cfi_restore_state
789 cfi_remember_state
790 ALIGN (4)
/* Forward copy using palignr with a shift of 10 bytes -- presumably
   entry 10 of L(shl_table) (defined outside this part of the file),
   i.e. the source is 10 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
791L(shl_10):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
792 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 10 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
793 sub $10, %eax
794 movaps (%eax), %xmm1
795 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
796 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
797 movdqu %xmm0, (%esi)
798 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
799L(shl_10_loop):
800
801 movdqa 16(%eax, %edi), %xmm2
802 sub $32, %ecx
803 movdqa 32(%eax, %edi), %xmm3
804 movdqa %xmm3, %xmm4
805 palignr $10, %xmm2, %xmm3
806 palignr $10, %xmm1, %xmm2
807 lea 32(%edi), %edi
808 movdqa %xmm2, -32(%edx, %edi)
809 movdqa %xmm3, -16(%edx, %edi)
810
811 jb L(shl_10_end)
812
813 movdqa 16(%eax, %edi), %xmm2
814 sub $32, %ecx
815 movdqa 32(%eax, %edi), %xmm3
816 movdqa %xmm3, %xmm1
817 palignr $10, %xmm2, %xmm3
818 palignr $10, %xmm4, %xmm2
819 lea 32(%edi), %edi
820 movdqa %xmm2, -32(%edx, %edi)
821 movdqa %xmm3, -16(%edx, %edi)
822
823 jae L(shl_10_loop)
824
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +10 in the lea undoes
    the earlier step back of EAX.  */
825L(shl_10_end):
826 add $32, %ecx
827 add %ecx, %edi
828 add %edi, %edx
829 lea 10(%edi, %eax), %eax
830 POP (%edi)
831 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
832
833 cfi_restore_state
834 cfi_remember_state
835 ALIGN (4)
/* Forward copy using palignr with a shift of 11 bytes -- presumably
   entry 11 of L(shl_table) (defined outside this part of the file),
   i.e. the source is 11 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
836L(shl_11):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
837 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 11 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
838 sub $11, %eax
839 movaps (%eax), %xmm1
840 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
841 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
842 movdqu %xmm0, (%esi)
843 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
844L(shl_11_loop):
845
846 movdqa 16(%eax, %edi), %xmm2
847 sub $32, %ecx
848 movdqa 32(%eax, %edi), %xmm3
849 movdqa %xmm3, %xmm4
850 palignr $11, %xmm2, %xmm3
851 palignr $11, %xmm1, %xmm2
852 lea 32(%edi), %edi
853 movdqa %xmm2, -32(%edx, %edi)
854 movdqa %xmm3, -16(%edx, %edi)
855
856 jb L(shl_11_end)
857
858 movdqa 16(%eax, %edi), %xmm2
859 sub $32, %ecx
860 movdqa 32(%eax, %edi), %xmm3
861 movdqa %xmm3, %xmm1
862 palignr $11, %xmm2, %xmm3
863 palignr $11, %xmm4, %xmm2
864 lea 32(%edi), %edi
865 movdqa %xmm2, -32(%edx, %edi)
866 movdqa %xmm3, -16(%edx, %edi)
867
868 jae L(shl_11_loop)
869
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +11 in the lea undoes
    the earlier step back of EAX.  */
870L(shl_11_end):
871 add $32, %ecx
872 add %ecx, %edi
873 add %edi, %edx
874 lea 11(%edi, %eax), %eax
875 POP (%edi)
876 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
877
878 cfi_restore_state
879 cfi_remember_state
880 ALIGN (4)
/* Forward copy using palignr with a shift of 12 bytes -- presumably
   entry 12 of L(shl_table) (defined outside this part of the file),
   i.e. the source is 12 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
881L(shl_12):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
882 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 12 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
883 sub $12, %eax
884 movaps (%eax), %xmm1
885 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
886 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
887 movdqu %xmm0, (%esi)
888 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
889L(shl_12_loop):
890
891 movdqa 16(%eax, %edi), %xmm2
892 sub $32, %ecx
893 movdqa 32(%eax, %edi), %xmm3
894 movdqa %xmm3, %xmm4
895 palignr $12, %xmm2, %xmm3
896 palignr $12, %xmm1, %xmm2
897 lea 32(%edi), %edi
898 movdqa %xmm2, -32(%edx, %edi)
899 movdqa %xmm3, -16(%edx, %edi)
900
901 jb L(shl_12_end)
902
903 movdqa 16(%eax, %edi), %xmm2
904 sub $32, %ecx
905 movdqa 32(%eax, %edi), %xmm3
906 movdqa %xmm3, %xmm1
907 palignr $12, %xmm2, %xmm3
908 palignr $12, %xmm4, %xmm2
909 lea 32(%edi), %edi
910 movdqa %xmm2, -32(%edx, %edi)
911 movdqa %xmm3, -16(%edx, %edi)
912
913 jae L(shl_12_loop)
914
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +12 in the lea undoes
    the earlier step back of EAX.  */
915L(shl_12_end):
916 add $32, %ecx
917 add %ecx, %edi
918 add %edi, %edx
919 lea 12(%edi, %eax), %eax
920 POP (%edi)
921 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
922
923 cfi_restore_state
924 cfi_remember_state
925 ALIGN (4)
/* Forward copy using palignr with a shift of 13 bytes -- presumably
   entry 13 of L(shl_table) (defined outside this part of the file),
   i.e. the source is 13 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
926L(shl_13):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
927 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 13 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
928 sub $13, %eax
929 movaps (%eax), %xmm1
930 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
931 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
932 movdqu %xmm0, (%esi)
933 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
934L(shl_13_loop):
935
936 movdqa 16(%eax, %edi), %xmm2
937 sub $32, %ecx
938 movdqa 32(%eax, %edi), %xmm3
939 movdqa %xmm3, %xmm4
940 palignr $13, %xmm2, %xmm3
941 palignr $13, %xmm1, %xmm2
942 lea 32(%edi), %edi
943 movdqa %xmm2, -32(%edx, %edi)
944 movdqa %xmm3, -16(%edx, %edi)
945
946 jb L(shl_13_end)
947
948 movdqa 16(%eax, %edi), %xmm2
949 sub $32, %ecx
950 movdqa 32(%eax, %edi), %xmm3
951 movdqa %xmm3, %xmm1
952 palignr $13, %xmm2, %xmm3
953 palignr $13, %xmm4, %xmm2
954 lea 32(%edi), %edi
955 movdqa %xmm2, -32(%edx, %edi)
956 movdqa %xmm3, -16(%edx, %edi)
957
958 jae L(shl_13_loop)
959
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +13 in the lea undoes
    the earlier step back of EAX.  */
960L(shl_13_end):
961 add $32, %ecx
962 add %ecx, %edi
963 add %edi, %edx
964 lea 13(%edi, %eax), %eax
965 POP (%edi)
966 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
967
968 cfi_restore_state
969 cfi_remember_state
970 ALIGN (4)
/* Forward copy using palignr with a shift of 14 bytes -- presumably
   entry 14 of L(shl_table) (defined outside this part of the file),
   i.e. the source is 14 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
971L(shl_14):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
972 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 14 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
973 sub $14, %eax
974 movaps (%eax), %xmm1
975 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
976 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
977 movdqu %xmm0, (%esi)
978 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
979L(shl_14_loop):
980
981 movdqa 16(%eax, %edi), %xmm2
982 sub $32, %ecx
983 movdqa 32(%eax, %edi), %xmm3
984 movdqa %xmm3, %xmm4
985 palignr $14, %xmm2, %xmm3
986 palignr $14, %xmm1, %xmm2
987 lea 32(%edi), %edi
988 movdqa %xmm2, -32(%edx, %edi)
989 movdqa %xmm3, -16(%edx, %edi)
990
991 jb L(shl_14_end)
992
993 movdqa 16(%eax, %edi), %xmm2
994 sub $32, %ecx
995 movdqa 32(%eax, %edi), %xmm3
996 movdqa %xmm3, %xmm1
997 palignr $14, %xmm2, %xmm3
998 palignr $14, %xmm4, %xmm2
999 lea 32(%edi), %edi
1000 movdqa %xmm2, -32(%edx, %edi)
1001 movdqa %xmm3, -16(%edx, %edi)
1002
1003 jae L(shl_14_loop)
1004
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +14 in the lea undoes
    the earlier step back of EAX.  */
1005L(shl_14_end):
1006 add $32, %ecx
1007 add %ecx, %edi
1008 add %edi, %edx
1009 lea 14(%edi, %eax), %eax
1010 POP (%edi)
1011 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1012
1013 cfi_restore_state
1014 cfi_remember_state
1015 ALIGN (4)
/* Forward copy using palignr with a shift of 15 bytes -- presumably
   entry 15 of L(shl_table) (defined outside this part of the file),
   i.e. the source is 15 bytes past a 16-byte boundary.
   In, as set up at L(48bytesormore): EAX = source, EDX = 16-byte
   aligned destination, ECX = bytes left, ESI = original destination,
   XMM0 = first 16 source bytes; ESI and EDI are on the stack.  */
1016L(shl_15):
 /* PIC: rebase EBX from this label onto L(table_48bytes_fwd) for the
    dispatch at the end.  Expands to nothing otherwise.  */
1017 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
 /* Step EAX back 15 bytes (movaps needs a 16-byte aligned address) and
    preload that aligned block into XMM1.  */
1018 sub $15, %eax
1019 movaps (%eax), %xmm1
1020 xor %edi, %edi
 /* Bias ECX by -32; the loop leaves on the borrow of its own sub.  */
1021 sub $32, %ecx
 /* Deferred store of the unaligned head.  */
1022 movdqu %xmm0, (%esi)
1023 POP (%esi)
 /* 32 bytes per half-iteration: two aligned loads are merged with the
    previously loaded block by palignr and stored aligned.  XMM4 carries
    the last block into the second half, XMM1 carries it back to the
    first.  The flags from `sub' survive until the branch because the
    SSE instructions and lea do not modify them.  */
1024L(shl_15_loop):
1025
1026 movdqa 16(%eax, %edi), %xmm2
1027 sub $32, %ecx
1028 movdqa 32(%eax, %edi), %xmm3
1029 movdqa %xmm3, %xmm4
1030 palignr $15, %xmm2, %xmm3
1031 palignr $15, %xmm1, %xmm2
1032 lea 32(%edi), %edi
1033 movdqa %xmm2, -32(%edx, %edi)
1034 movdqa %xmm3, -16(%edx, %edi)
1035
1036 jb L(shl_15_end)
1037
1038 movdqa 16(%eax, %edi), %xmm2
1039 sub $32, %ecx
1040 movdqa 32(%eax, %edi), %xmm3
1041 movdqa %xmm3, %xmm1
1042 palignr $15, %xmm2, %xmm3
1043 palignr $15, %xmm4, %xmm2
1044 lea 32(%edi), %edi
1045 movdqa %xmm2, -32(%edx, %edi)
1046 movdqa %xmm3, -16(%edx, %edi)
1047
1048 jae L(shl_15_loop)
1049
 /* Undo the bias: ECX = bytes still to copy (below 32).  Both pointers
    are moved one past the end of the data; the +15 in the lea undoes
    the earlier step back of EAX.  */
1050L(shl_15_end):
1051 add $32, %ecx
1052 add %ecx, %edi
1053 add %edi, %edx
1054 lea 15(%edi, %eax), %eax
1055 POP (%edi)
1056 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1057
1058
1059 ALIGN (4)
/* Tail copy for remainders that are a multiple of 4 (44 down to 0) --
   presumably selected through L(table_48bytes_fwd), which is defined
   outside this part of the file.  EAX / EDX point one past the end of
   the source / destination data (the dispatch sites add the remaining
   count to both before jumping), so each label copies one 4-byte word
   at a negative offset and falls through to the next smaller size.
   ECX is scratch.  */
1060L(fwd_write_44bytes):
1061 movl -44(%eax), %ecx
1062 movl %ecx, -44(%edx)
1063L(fwd_write_40bytes):
1064 movl -40(%eax), %ecx
1065 movl %ecx, -40(%edx)
1066L(fwd_write_36bytes):
1067 movl -36(%eax), %ecx
1068 movl %ecx, -36(%edx)
1069L(fwd_write_32bytes):
1070 movl -32(%eax), %ecx
1071 movl %ecx, -32(%edx)
1072L(fwd_write_28bytes):
1073 movl -28(%eax), %ecx
1074 movl %ecx, -28(%edx)
1075L(fwd_write_24bytes):
1076 movl -24(%eax), %ecx
1077 movl %ecx, -24(%edx)
1078L(fwd_write_20bytes):
1079 movl -20(%eax), %ecx
1080 movl %ecx, -20(%edx)
1081L(fwd_write_16bytes):
1082 movl -16(%eax), %ecx
1083 movl %ecx, -16(%edx)
1084L(fwd_write_12bytes):
1085 movl -12(%eax), %ecx
1086 movl %ecx, -12(%edx)
1087L(fwd_write_8bytes):
1088 movl -8(%eax), %ecx
1089 movl %ecx, -8(%edx)
1090L(fwd_write_4bytes):
1091 movl -4(%eax), %ecx
1092 movl %ecx, -4(%edx)
 /* Return value: with USE_AS_MEMPCPY the current EDX, otherwise the
    destination argument reloaded from the stack.  */
1093L(fwd_write_0bytes):
1094#ifdef USE_AS_MEMPCPY
1095 movl %edx, %eax
1096#else
1097 movl DEST(%esp), %eax
1098#endif
1099 RETURN
1100
1101 ALIGN (4)
/* Tail copy of exactly 5 bytes, with EAX / EDX one past the end of the
   data: two overlapping 4-byte words (offsets -5 and -4) cover the five
   bytes.  Both words are loaded before either store; EAX is reused for
   the second word, which is fine because the return value is set
   afterwards.  */
1102L(fwd_write_5bytes):
1103 movl -5(%eax), %ecx
1104 movl -4(%eax), %eax
1105 movl %ecx, -5(%edx)
1106 movl %eax, -4(%edx)
 /* Return value: with USE_AS_MEMPCPY the current EDX, otherwise the
    destination argument reloaded from the stack.  */
1107#ifdef USE_AS_MEMPCPY
1108 movl %edx, %eax
1109#else
1110 movl DEST(%esp), %eax
1111#endif
1112 RETURN
1113
1114 ALIGN (4)
/* Tail copy for remainders of the form 4k + 1 (45 down to 1, except 5,
   which has its own entry).  EAX / EDX point one past the end of the
   data; each label copies one 4-byte word at a negative offset and
   falls through to the next smaller size.  ECX is scratch.  */
1115L(fwd_write_45bytes):
1116 movl -45(%eax), %ecx
1117 movl %ecx, -45(%edx)
1118L(fwd_write_41bytes):
1119 movl -41(%eax), %ecx
1120 movl %ecx, -41(%edx)
1121L(fwd_write_37bytes):
1122 movl -37(%eax), %ecx
1123 movl %ecx, -37(%edx)
1124L(fwd_write_33bytes):
1125 movl -33(%eax), %ecx
1126 movl %ecx, -33(%edx)
1127L(fwd_write_29bytes):
1128 movl -29(%eax), %ecx
1129 movl %ecx, -29(%edx)
1130L(fwd_write_25bytes):
1131 movl -25(%eax), %ecx
1132 movl %ecx, -25(%edx)
1133L(fwd_write_21bytes):
1134 movl -21(%eax), %ecx
1135 movl %ecx, -21(%edx)
1136L(fwd_write_17bytes):
1137 movl -17(%eax), %ecx
1138 movl %ecx, -17(%edx)
1139L(fwd_write_13bytes):
1140 movl -13(%eax), %ecx
1141 movl %ecx, -13(%edx)
 /* The 9-byte entry copies the words at -9 and -5 itself, then runs
    into the 1-byte entry for the final byte.  */
1142L(fwd_write_9bytes):
1143 movl -9(%eax), %ecx
1144 movl %ecx, -9(%edx)
1145 movl -5(%eax), %ecx
1146 movl %ecx, -5(%edx)
1147L(fwd_write_1bytes):
1148 movzbl -1(%eax), %ecx
1149 movb %cl, -1(%edx)
 /* Return value: with USE_AS_MEMPCPY the current EDX, otherwise the
    destination argument reloaded from the stack.  */
1150#ifdef USE_AS_MEMPCPY
1151 movl %edx, %eax
1152#else
1153 movl DEST(%esp), %eax
1154#endif
1155 RETURN
1156
1157 ALIGN (4)
/* Tail copy for remainders of the form 4k + 2 (46 down to 2).  EAX /
   EDX point one past the end of the data; each label copies one 4-byte
   word at a negative offset and falls through to the next smaller size,
   ending with a 2-byte copy.  ECX is scratch.  */
1158L(fwd_write_46bytes):
1159 movl -46(%eax), %ecx
1160 movl %ecx, -46(%edx)
1161L(fwd_write_42bytes):
1162 movl -42(%eax), %ecx
1163 movl %ecx, -42(%edx)
1164L(fwd_write_38bytes):
1165 movl -38(%eax), %ecx
1166 movl %ecx, -38(%edx)
1167L(fwd_write_34bytes):
1168 movl -34(%eax), %ecx
1169 movl %ecx, -34(%edx)
1170L(fwd_write_30bytes):
1171 movl -30(%eax), %ecx
1172 movl %ecx, -30(%edx)
1173L(fwd_write_26bytes):
1174 movl -26(%eax), %ecx
1175 movl %ecx, -26(%edx)
1176L(fwd_write_22bytes):
1177 movl -22(%eax), %ecx
1178 movl %ecx, -22(%edx)
1179L(fwd_write_18bytes):
1180 movl -18(%eax), %ecx
1181 movl %ecx, -18(%edx)
1182L(fwd_write_14bytes):
1183 movl -14(%eax), %ecx
1184 movl %ecx, -14(%edx)
1185L(fwd_write_10bytes):
1186 movl -10(%eax), %ecx
1187 movl %ecx, -10(%edx)
1188L(fwd_write_6bytes):
1189 movl -6(%eax), %ecx
1190 movl %ecx, -6(%edx)
1191L(fwd_write_2bytes):
1192 movzwl -2(%eax), %ecx
1193 movw %cx, -2(%edx)
 /* Return value: with USE_AS_MEMPCPY the current EDX, otherwise the
    destination argument reloaded from the stack.  */
1194#ifdef USE_AS_MEMPCPY
1195 movl %edx, %eax
1196#else
1197 movl DEST(%esp), %eax
1198#endif
1199 RETURN
1200
1201 ALIGN (4)
/* Tail copy for remainders of the form 4k + 3 (47 down to 3).  EAX /
   EDX point one past the end of the data; each label copies one 4-byte
   word at a negative offset and falls through to the next smaller size,
   ending with a 2-byte plus a 1-byte copy.  ECX is scratch.  */
1202L(fwd_write_47bytes):
1203 movl -47(%eax), %ecx
1204 movl %ecx, -47(%edx)
1205L(fwd_write_43bytes):
1206 movl -43(%eax), %ecx
1207 movl %ecx, -43(%edx)
1208L(fwd_write_39bytes):
1209 movl -39(%eax), %ecx
1210 movl %ecx, -39(%edx)
1211L(fwd_write_35bytes):
1212 movl -35(%eax), %ecx
1213 movl %ecx, -35(%edx)
1214L(fwd_write_31bytes):
1215 movl -31(%eax), %ecx
1216 movl %ecx, -31(%edx)
1217L(fwd_write_27bytes):
1218 movl -27(%eax), %ecx
1219 movl %ecx, -27(%edx)
1220L(fwd_write_23bytes):
1221 movl -23(%eax), %ecx
1222 movl %ecx, -23(%edx)
1223L(fwd_write_19bytes):
1224 movl -19(%eax), %ecx
1225 movl %ecx, -19(%edx)
1226L(fwd_write_15bytes):
1227 movl -15(%eax), %ecx
1228 movl %ecx, -15(%edx)
1229L(fwd_write_11bytes):
1230 movl -11(%eax), %ecx
1231 movl %ecx, -11(%edx)
1232L(fwd_write_7bytes):
1233 movl -7(%eax), %ecx
1234 movl %ecx, -7(%edx)
 /* Last three bytes: a 16-bit word at -3 and a byte at -1.  Both are
    loaded before the stores; EAX is reused for the byte because the
    return value is set afterwards.  */
1235L(fwd_write_3bytes):
1236 movzwl -3(%eax), %ecx
1237 movzbl -1(%eax), %eax
1238 movw %cx, -3(%edx)
1239 movb %al, -1(%edx)
 /* Return value: with USE_AS_MEMPCPY the current EDX, otherwise the
    destination argument reloaded from the stack.  RETURN_END is used
    instead of RETURN here: no CFI_PUSH follows the ret.  */
1240#ifdef USE_AS_MEMPCPY
1241 movl %edx, %eax
1242#else
1243 movl DEST(%esp), %eax
1244#endif
1245 RETURN_END
1246
1247 cfi_restore_state
1248 cfi_remember_state
1249 ALIGN (4)
/* Copy path taken from L(48bytesormore) when the remaining length is at
   or above the shared-cache limit compared there.  Stores to the
   aligned destination use movntdq; loads are unaligned (movdqu).
   In: EAX = source, EDX = 16-byte aligned destination, ECX = bytes
   left, ESI = original destination, XMM0 = first 16 source bytes;
   ESI and EDI are on the stack.  */
1250L(large_page):
 /* Copy the next 16 bytes to the aligned destination and perform the
    deferred store of the unaligned head, then step past them.  */
1251 movdqu (%eax), %xmm1
1252 movdqu %xmm0, (%esi)
1253 movntdq %xmm1, (%edx)
1254 add $0x10, %eax
1255 add $0x10, %edx
1256 sub $0x10, %ecx
 /* When the low bytes of source and destination are equal, hand over to
    the `rep movsl' path (ESI / EDI are still on the stack for it).  */
1257 cmp %al, %dl
1258 je L(copy_page_by_rep)
 /* ESI / EDI are not needed below, so they are restored here.  ECX is
    biased by -128; the loop leaves on the borrow of its own sub.  */
1259L(large_page_loop_init):
1260 POP (%esi)
1261 sub $0x80, %ecx
1262 POP (%edi)
 /* 128 bytes per iteration: eight unaligned loads, an lfence, then
    eight movntdq stores.  The flags from `sub $0x80' survive until the
    jae because the SSE stores and lea do not modify them.  */
1263L(large_page_loop):
1264 prefetchnta 0x1c0(%eax)
1265 prefetchnta 0x280(%eax)
1266 movdqu (%eax), %xmm0
1267 movdqu 0x10(%eax), %xmm1
1268 movdqu 0x20(%eax), %xmm2
1269 movdqu 0x30(%eax), %xmm3
1270 movdqu 0x40(%eax), %xmm4
1271 movdqu 0x50(%eax), %xmm5
1272 movdqu 0x60(%eax), %xmm6
1273 movdqu 0x70(%eax), %xmm7
1274 lea 0x80(%eax), %eax
1275 lfence
1276 sub $0x80, %ecx
1277 movntdq %xmm0, (%edx)
1278 movntdq %xmm1, 0x10(%edx)
1279 movntdq %xmm2, 0x20(%edx)
1280 movntdq %xmm3, 0x30(%edx)
1281 movntdq %xmm4, 0x40(%edx)
1282 movntdq %xmm5, 0x50(%edx)
1283 movntdq %xmm6, 0x60(%edx)
1284 movntdq %xmm7, 0x70(%edx)
1285 lea 0x80(%edx), %edx
1286 jae L(large_page_loop)
 /* Undo the bias: ECX = bytes left, below 128.  Peel off one 64-byte
    and one 32-byte piece as far as ECX allows.  */
1287 add $0x80, %ecx
1288 cmp $0x40, %ecx
1289 jb L(large_page_less_64bytes)
1290
1291 movdqu (%eax), %xmm0
1292 movdqu 0x10(%eax), %xmm1
1293 movdqu 0x20(%eax), %xmm2
1294 movdqu 0x30(%eax), %xmm3
1295 lea 0x40(%eax), %eax
1296
1297 movntdq %xmm0, (%edx)
1298 movntdq %xmm1, 0x10(%edx)
1299 movntdq %xmm2, 0x20(%edx)
1300 movntdq %xmm3, 0x30(%edx)
1301 lea 0x40(%edx), %edx
1302 sub $0x40, %ecx
1303L(large_page_less_64bytes):
1304 cmp $32, %ecx
1305 jb L(large_page_less_32bytes)
1306 movdqu (%eax), %xmm0
1307 movdqu 0x10(%eax), %xmm1
1308 lea 0x20(%eax), %eax
1309 movntdq %xmm0, (%edx)
1310 movntdq %xmm1, 0x10(%edx)
1311 lea 0x20(%edx), %edx
1312 sub $0x20, %ecx
 /* Fewer than 32 bytes remain.  Move both pointers one past the end of
    the data, issue an sfence after the movntdq stores, and dispatch on
    the remainder.  */
1313L(large_page_less_32bytes):
1314 add %ecx, %edx
1315 add %ecx, %eax
1316 sfence
1317 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
1318
1319 cfi_restore_state
1320 cfi_remember_state
1321 ALIGN (4)
/* Copy the remaining ECX bytes with `rep movsl' plus a tail of up to
   three bytes.  Reached when the low bytes of the source (EAX) and
   destination (EDX) addresses are equal.  ESI and EDI are still on the
   stack on entry and are restored before returning.  */
1322L(copy_page_by_rep):
1323 mov %eax, %esi
1324 mov %edx, %edi
 /* ECX = number of 4-byte words, EDX = leftover byte count (0..3).  */
1325 mov %ecx, %edx
1326 shr $2, %ecx
1327 and $3, %edx
 /* rep movsl does not modify the flags, so the jz below still tests the
    result of `and $3, %edx': no leftover bytes means done.  */
1328 rep movsl
1329 jz L(copy_page_by_rep_exit)
 /* Two or three bytes left: copy a 16-bit word first.  */
1330 cmp $2, %edx
1331 jb L(copy_page_by_rep_left_1)
1332 movzwl (%esi), %eax
1333 movw %ax, (%edi)
1334 add $2, %esi
1335 add $2, %edi
1336 sub $2, %edx
1337 jz L(copy_page_by_rep_exit)
1338L(copy_page_by_rep_left_1):
1339 movzbl (%esi), %eax
1340 movb %al, (%edi)
 /* Return value: the destination argument from the stack, plus the
    length argument when built with USE_AS_MEMPCPY.  */
1341L(copy_page_by_rep_exit):
1342 POP (%esi)
1343 POP (%edi)
1344 movl DEST(%esp), %eax
1345#ifdef USE_AS_MEMPCPY
1346 movl LEN(%esp), %ecx
1347 add %ecx, %eax
1348#endif
1349 RETURN
1350
/* Tail handlers for remaining sizes that are multiples of 4; these
   labels are the targets listed in L(table_48_bytes_bwd).  EAX and EDX
   point at the start of the remaining source and destination bytes.
   Entering at L(bk_write_Nbytes) copies the dword at offset N-4 and
   falls through label by label down to the dword at offset 0.  ECX is
   used as scratch.  */
1351 ALIGN (4)
1352L(bk_write_44bytes):
1353 movl 40(%eax), %ecx
1354 movl %ecx, 40(%edx)
1355L(bk_write_40bytes):
1356 movl 36(%eax), %ecx
1357 movl %ecx, 36(%edx)
1358L(bk_write_36bytes):
1359 movl 32(%eax), %ecx
1360 movl %ecx, 32(%edx)
1361L(bk_write_32bytes):
1362 movl 28(%eax), %ecx
1363 movl %ecx, 28(%edx)
1364L(bk_write_28bytes):
1365 movl 24(%eax), %ecx
1366 movl %ecx, 24(%edx)
1367L(bk_write_24bytes):
1368 movl 20(%eax), %ecx
1369 movl %ecx, 20(%edx)
1370L(bk_write_20bytes):
1371 movl 16(%eax), %ecx
1372 movl %ecx, 16(%edx)
1373L(bk_write_16bytes):
1374 movl 12(%eax), %ecx
1375 movl %ecx, 12(%edx)
1376L(bk_write_12bytes):
1377 movl 8(%eax), %ecx
1378 movl %ecx, 8(%edx)
1379L(bk_write_8bytes):
1380 movl 4(%eax), %ecx
1381 movl %ecx, 4(%edx)
1382L(bk_write_4bytes):
1383 movl (%eax), %ecx
1384 movl %ecx, (%edx)
/* Nothing left to copy.  Return DEST, or DEST + LEN when built as
   mempcpy.  */
1385L(bk_write_0bytes):
1386 movl DEST(%esp), %eax
1387#ifdef USE_AS_MEMPCPY
1388 movl LEN(%esp), %ecx
1389 add %ecx, %eax
1390#endif
1391 RETURN
1392
/* Tail handlers for remaining sizes of the form 4k + 1; these labels
   are the targets listed in L(table_48_bytes_bwd).  EAX and EDX point at
   the start of the remaining source and destination bytes.  Entering at
   L(bk_write_Nbytes) copies the dword at offset N-4 and falls through
   down to the dword at offset 1; the byte at offset 0 is copied last.
   ECX is used as scratch.  */
1393 ALIGN (4)
1394L(bk_write_45bytes):
1395 movl 41(%eax), %ecx
1396 movl %ecx, 41(%edx)
1397L(bk_write_41bytes):
1398 movl 37(%eax), %ecx
1399 movl %ecx, 37(%edx)
1400L(bk_write_37bytes):
1401 movl 33(%eax), %ecx
1402 movl %ecx, 33(%edx)
1403L(bk_write_33bytes):
1404 movl 29(%eax), %ecx
1405 movl %ecx, 29(%edx)
1406L(bk_write_29bytes):
1407 movl 25(%eax), %ecx
1408 movl %ecx, 25(%edx)
1409L(bk_write_25bytes):
1410 movl 21(%eax), %ecx
1411 movl %ecx, 21(%edx)
1412L(bk_write_21bytes):
1413 movl 17(%eax), %ecx
1414 movl %ecx, 17(%edx)
1415L(bk_write_17bytes):
1416 movl 13(%eax), %ecx
1417 movl %ecx, 13(%edx)
1418L(bk_write_13bytes):
1419 movl 9(%eax), %ecx
1420 movl %ecx, 9(%edx)
1421L(bk_write_9bytes):
1422 movl 5(%eax), %ecx
1423 movl %ecx, 5(%edx)
1424L(bk_write_5bytes):
1425 movl 1(%eax), %ecx
1426 movl %ecx, 1(%edx)
/* Final single byte at offset 0.  */
1427L(bk_write_1bytes):
1428 movzbl (%eax), %ecx
1429 movb %cl, (%edx)
/* Return DEST, or DEST + LEN when built as mempcpy.  */
1430 movl DEST(%esp), %eax
1431#ifdef USE_AS_MEMPCPY
1432 movl LEN(%esp), %ecx
1433 add %ecx, %eax
1434#endif
1435 RETURN
1436
/* Tail handlers for remaining sizes of the form 4k + 2; these labels
   are the targets listed in L(table_48_bytes_bwd).  EAX and EDX point at
   the start of the remaining source and destination bytes.  Entering at
   L(bk_write_Nbytes) copies the dword at offset N-4 and falls through
   down to the dword at offset 2; the word at offset 0 is copied last.
   ECX is used as scratch.  */
1437 ALIGN (4)
1438L(bk_write_46bytes):
1439 movl 42(%eax), %ecx
1440 movl %ecx, 42(%edx)
1441L(bk_write_42bytes):
1442 movl 38(%eax), %ecx
1443 movl %ecx, 38(%edx)
1444L(bk_write_38bytes):
1445 movl 34(%eax), %ecx
1446 movl %ecx, 34(%edx)
1447L(bk_write_34bytes):
1448 movl 30(%eax), %ecx
1449 movl %ecx, 30(%edx)
1450L(bk_write_30bytes):
1451 movl 26(%eax), %ecx
1452 movl %ecx, 26(%edx)
1453L(bk_write_26bytes):
1454 movl 22(%eax), %ecx
1455 movl %ecx, 22(%edx)
1456L(bk_write_22bytes):
1457 movl 18(%eax), %ecx
1458 movl %ecx, 18(%edx)
1459L(bk_write_18bytes):
1460 movl 14(%eax), %ecx
1461 movl %ecx, 14(%edx)
1462L(bk_write_14bytes):
1463 movl 10(%eax), %ecx
1464 movl %ecx, 10(%edx)
1465L(bk_write_10bytes):
1466 movl 6(%eax), %ecx
1467 movl %ecx, 6(%edx)
1468L(bk_write_6bytes):
1469 movl 2(%eax), %ecx
1470 movl %ecx, 2(%edx)
/* Final 16-bit word at offset 0.  */
1471L(bk_write_2bytes):
1472 movzwl (%eax), %ecx
1473 movw %cx, (%edx)
/* Return DEST, or DEST + LEN when built as mempcpy.  */
1474 movl DEST(%esp), %eax
1475#ifdef USE_AS_MEMPCPY
1476 movl LEN(%esp), %ecx
1477 add %ecx, %eax
1478#endif
1479 RETURN
1480
/* Tail handlers for remaining sizes of the form 4k + 3; these labels
   are the targets listed in L(table_48_bytes_bwd).  EAX and EDX point at
   the start of the remaining source and destination bytes.  Entering at
   L(bk_write_Nbytes) copies the dword at offset N-4 and falls through
   down to the dword at offset 3; the word at offset 1 and the byte at
   offset 0 are copied last.  ECX is used as scratch.  */
1481 ALIGN (4)
1482L(bk_write_47bytes):
1483 movl 43(%eax), %ecx
1484 movl %ecx, 43(%edx)
1485L(bk_write_43bytes):
1486 movl 39(%eax), %ecx
1487 movl %ecx, 39(%edx)
1488L(bk_write_39bytes):
1489 movl 35(%eax), %ecx
1490 movl %ecx, 35(%edx)
1491L(bk_write_35bytes):
1492 movl 31(%eax), %ecx
1493 movl %ecx, 31(%edx)
1494L(bk_write_31bytes):
1495 movl 27(%eax), %ecx
1496 movl %ecx, 27(%edx)
1497L(bk_write_27bytes):
1498 movl 23(%eax), %ecx
1499 movl %ecx, 23(%edx)
1500L(bk_write_23bytes):
1501 movl 19(%eax), %ecx
1502 movl %ecx, 19(%edx)
1503L(bk_write_19bytes):
1504 movl 15(%eax), %ecx
1505 movl %ecx, 15(%edx)
1506L(bk_write_15bytes):
1507 movl 11(%eax), %ecx
1508 movl %ecx, 11(%edx)
1509L(bk_write_11bytes):
1510 movl 7(%eax), %ecx
1511 movl %ecx, 7(%edx)
1512L(bk_write_7bytes):
1513 movl 3(%eax), %ecx
1514 movl %ecx, 3(%edx)
/* Last three bytes: the word at offset 1, then the byte at offset 0.  */
1515L(bk_write_3bytes):
1516 movzwl 1(%eax), %ecx
1517 movw %cx, 1(%edx)
/* The byte load overwrites EAX (the source pointer); that is safe
   because EAX is reloaded with the return value right below.  */
1518 movzbl (%eax), %eax
1519 movb %al, (%edx)
/* Return DEST, or DEST + LEN when built as mempcpy.  */
1520 movl DEST(%esp), %eax
1521#ifdef USE_AS_MEMPCPY
1522 movl LEN(%esp), %ecx
1523 add %ecx, %eax
1524#endif
/* NOTE(review): unlike RETURN, RETURN_END does not re-declare EBX as
   pushed (CFI_PUSH) under PIC.  When USE_AS_MEMMOVE is defined,
   L(copy_backward) is assembled after this point -- confirm that the
   CFI state it inherits is the intended one.  */
1525 RETURN_END
1526
1527
/* Jump tables are placed in a separate read-only data section.  */
1528 .pushsection .rodata.ssse3,"a",@progbits
1529 ALIGN (2)
/* Forward tail dispatch table: entry N (0 <= N <= 47) refers to
   L(fwd_write_Nbytes).  It is indexed by the remaining byte count with a
   scale of 4 via BRANCH_TO_JMPTBL_ENTRY.  Under PIC, JMPTBL (I, B)
   expands to I - B, so each entry is an offset from the table base.  */
1530L(table_48bytes_fwd):
1531 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
1532 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
1533 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
1534 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
1535 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
1536 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
1537 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
1538 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
1539 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
1540 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
1541 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
1542 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
1543 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
1544 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
1545 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
1546 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
1547 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
1548 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
1549 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
1550 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
1551 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
1552 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
1553 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
1554 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
1555 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
1556 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
1557 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
1558 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
1559 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
1560 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
1561 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
1562 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
1563 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
1564 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
1565 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
1566 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
1567 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
1568 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
1569 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
1570 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
1571 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
1572 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
1573 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
1574 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
1575 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
1576 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
1577 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
1578 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
1579
1580 ALIGN (2)
/* Dispatch table with sixteen entries, L(shl_0) through L(shl_15).  The
   handlers and the code that indexes this table are outside this chunk;
   presumably the index is a 0-15 byte misalignment -- confirm.  */
1581L(shl_table):
1582 .int JMPTBL (L(shl_0), L(shl_table))
1583 .int JMPTBL (L(shl_1), L(shl_table))
1584 .int JMPTBL (L(shl_2), L(shl_table))
1585 .int JMPTBL (L(shl_3), L(shl_table))
1586 .int JMPTBL (L(shl_4), L(shl_table))
1587 .int JMPTBL (L(shl_5), L(shl_table))
1588 .int JMPTBL (L(shl_6), L(shl_table))
1589 .int JMPTBL (L(shl_7), L(shl_table))
1590 .int JMPTBL (L(shl_8), L(shl_table))
1591 .int JMPTBL (L(shl_9), L(shl_table))
1592 .int JMPTBL (L(shl_10), L(shl_table))
1593 .int JMPTBL (L(shl_11), L(shl_table))
1594 .int JMPTBL (L(shl_12), L(shl_table))
1595 .int JMPTBL (L(shl_13), L(shl_table))
1596 .int JMPTBL (L(shl_14), L(shl_table))
1597 .int JMPTBL (L(shl_15), L(shl_table))
1598
1599 ALIGN (2)
/* Backward tail dispatch table: entry N (0 <= N <= 47) refers to
   L(bk_write_Nbytes).  L(bk_write_less48bytes) indexes it by the
   remaining byte count in ECX with a scale of 4.  */
1600L(table_48_bytes_bwd):
1601 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
1602 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
1603 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
1604 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
1605 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
1606 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
1607 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
1608 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
1609 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
1610 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
1611 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
1612 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
1613 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
1614 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
1615 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
1616 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
1617 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
1618 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
1619 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
1620 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
1621 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
1622 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
1623 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
1624 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
1625 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
1626 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
1627 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
1628 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
1629 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
1630 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
1631 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
1632 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
1633 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
1634 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
1635 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
1636 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
1637 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
1638 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
1639 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
1640 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
1641 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
1642 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
1643 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
1644 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
1645 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
1646 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
1647 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
1648 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
1649
/* Leave the jump-table section and return to the previous section.  */
1650 .popsection
1651
1652#ifdef USE_AS_MEMMOVE
1653 ALIGN (4)
/* Backward copy, assembled only for memmove.  On entry EAX and EDX are
   used as the source and destination start and ECX as the byte count.
   ESI is saved, then ESI and EDX are moved to one past the end of the
   source and destination so that the copy proceeds from high to low
   addresses.  */
1654L(copy_backward):
1655 PUSH (%esi)
1656 movl %eax, %esi
1657 add %ecx, %edx
1658 add %ecx, %esi
/* Align the destination end to 4 bytes first if necessary.  */
1659 testl $0x3, %edx
1660 jnz L(bk_align)
1661
/* Destination end is 4-byte aligned (or alignment was skipped).  */
1662L(bk_aligned_4):
1663 cmp $64, %ecx
1664 jae L(bk_write_more64bytes)
1665
1666L(bk_write_64bytesless):
1667 cmp $32, %ecx
1668 jb L(bk_write_less32bytes)
1669
/* Copy the 32 bytes just below the current end pointers, highest dword
   first, then move both end pointers down by 32.  EAX is scratch.  */
1670L(bk_write_more32bytes):
1671 /* Copy 32 bytes at a time. */
1672 sub $32, %ecx
1673 movl -4(%esi), %eax
1674 movl %eax, -4(%edx)
1675 movl -8(%esi), %eax
1676 movl %eax, -8(%edx)
1677 movl -12(%esi), %eax
1678 movl %eax, -12(%edx)
1679 movl -16(%esi), %eax
1680 movl %eax, -16(%edx)
1681 movl -20(%esi), %eax
1682 movl %eax, -20(%edx)
1683 movl -24(%esi), %eax
1684 movl %eax, -24(%edx)
1685 movl -28(%esi), %eax
1686 movl %eax, -28(%edx)
1687 movl -32(%esi), %eax
1688 movl %eax, -32(%edx)
1689 sub $32, %edx
1690 sub $32, %esi
1691
/* Convert the end pointers back into start pointers of the remaining
   ECX bytes (EAX = source, EDX = destination), restore ESI, and let the
   L(bk_write_Nbytes) handlers finish the copy and return.  */
1692L(bk_write_less32bytes):
1693 movl %esi, %eax
1694 sub %ecx, %edx
1695 sub %ecx, %eax
1696 POP (%esi)
/* Dispatch on the remaining byte count in ECX.  */
1697L(bk_write_less48bytes):
1698 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
1699
/* The POP (%esi) above told the unwinder that ESI was restored, but the
   code below is reached by a jump from L(copy_backward) with ESI still
   on the stack, so declare it pushed again for the CFI.  */
1700 CFI_PUSH (%esi)
1701 ALIGN (4)
/* Bring the destination end pointer in EDX down to a 4-byte boundary by
   copying one byte and/or one 16-bit word from the end.  With 8 bytes or
   fewer, skip the alignment and go straight to the tail dispatch.  */
1702L(bk_align):
1703 cmp $8, %ecx
1704 jbe L(bk_write_less32bytes)
1705 testl $1, %edx
1706 /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
1707 then (EDX & 2) must be != 0. */
1708 jz L(bk_got2)
/* EDX is odd: copy one byte so that it becomes even.  */
1709 sub $1, %esi
1710 sub $1, %ecx
1711 sub $1, %edx
1712 movzbl (%esi), %eax
1713 movb %al, (%edx)
1714
/* If bit 1 is clear, EDX is now 4-byte aligned.  */
1715 testl $2, %edx
1716 jz L(bk_aligned_4)
1717
/* EDX is even but not a multiple of 4: copy one 16-bit word.  */
1718L(bk_got2):
1719 sub $2, %esi
1720 sub $2, %ecx
1721 sub $2, %edx
1722 movzwl (%esi), %eax
1723 movw %ax, (%edx)
1724 jmp L(bk_aligned_4)
1725
1726 ALIGN (4)
/* At least 64 bytes remain and the destination end pointer in EDX is
   4-byte aligned.  Bring EDX to a 16-byte boundary, then copy 64 bytes
   per iteration with SSE loads and stores.  */
1727L(bk_write_more64bytes):
1728 /* Check alignment of last byte. */
1729 testl $15, %edx
1730 jz L(bk_ssse3_cpy_pre)
1731
1732/* EDX is aligned 4 bytes, but not 16 bytes. */
/* Copy one dword at a time from the end, at most three times, until
   EDX is a multiple of 16.  EAX is scratch.  */
1733L(bk_ssse3_align):
1734 sub $4, %esi
1735 sub $4, %ecx
1736 sub $4, %edx
1737 movl (%esi), %eax
1738 movl %eax, (%edx)
1739
1740 testl $15, %edx
1741 jz L(bk_ssse3_cpy_pre)
1742
1743 sub $4, %esi
1744 sub $4, %ecx
1745 sub $4, %edx
1746 movl (%esi), %eax
1747 movl %eax, (%edx)
1748
1749 testl $15, %edx
1750 jz L(bk_ssse3_cpy_pre)
1751
1752 sub $4, %esi
1753 sub $4, %ecx
1754 sub $4, %edx
1755 movl (%esi), %eax
1756 movl %eax, (%edx)
1757
/* The alignment step consumed at most 12 bytes, so at least 52 remain;
   if that is below 64, use the 32-byte path instead of the loop.  */
1758L(bk_ssse3_cpy_pre):
1759 cmp $64, %ecx
1760 jb L(bk_write_more32bytes)
1761
/* Main loop: move both end pointers down by 64, then copy the four
   16-byte chunks from the highest address to the lowest.  Loads are
   unaligned (movdqu); stores use movdqa because EDX is 16-byte
   aligned.  */
1762L(bk_ssse3_cpy):
1763 sub $64, %esi
1764 sub $64, %ecx
1765 sub $64, %edx
1766 movdqu 0x30(%esi), %xmm3
1767 movdqa %xmm3, 0x30(%edx)
1768 movdqu 0x20(%esi), %xmm2
1769 movdqa %xmm2, 0x20(%edx)
1770 movdqu 0x10(%esi), %xmm1
1771 movdqa %xmm1, 0x10(%edx)
1772 movdqu (%esi), %xmm0
1773 movdqa %xmm0, (%edx)
1774 cmp $64, %ecx
1775 jae L(bk_ssse3_cpy)
/* Fewer than 64 bytes left: finish with the 32-byte and tail paths.  */
1776 jmp L(bk_write_64bytesless)
1777
1778#endif
1779
1780END (MEMCPY)
1781
1782#endif
1783

source code of glibc/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S