1 | /* memcpy with SSSE3 and REP string. |
2 | Copyright (C) 2010-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

#include "asm-syntax.h"

/* Default entry-point names.  Wrapper files (e.g. the memmove/mempcpy
   variants) may define MEMCPY/MEMCPY_CHK before including this file.  */
#ifndef MEMCPY
# define MEMCPY __memcpy_ssse3_rep
# define MEMCPY_CHK __memcpy_chk_ssse3_rep
#endif

/* Stack offsets of the three arguments relative to %esp after
   ENTRANCE has run (return address, plus saved %ebx when PIC).  */
#define DEST PARMS
#define SRC DEST+4
#define LEN SRC+4

/* Push/pop helpers that keep the CFI unwind information in sync with
   the corresponding stack-pointer adjustment.  */
#define CFI_PUSH(REG) \
 cfi_adjust_cfa_offset (4); \
 cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
 cfi_adjust_cfa_offset (-4); \
 cfi_restore (REG)

#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

#ifdef PIC
/* PIC build: %ebx is reserved for the GOT pointer, so it is saved on
   entry; jump tables hold PC-relative offsets.  */
# define PARMS 8 /* Preserve EBX. */
# define ENTRANCE PUSH (%ebx);
# define RETURN_END POP (%ebx); ret
/* RETURN restores %ebx and returns, then re-records the saved-%ebx CFI
   state for the code that follows the return path.  */
# define RETURN RETURN_END; CFI_PUSH (%ebx)
# define JMPTBL(I, B) I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register containing the
   index into the jump table.  SCALE is the scale of INDEX.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
    /* We first load PC into EBX.  */ \
    SETUP_PIC_REG(bx); \
    /* Get the address of the jump table.  */ \
    addl $(TABLE - .), %ebx; \
    /* Get the entry and convert the relative offset to the \
       absolute address.  */ \
    addl (%ebx,INDEX,SCALE), %ebx; \
    /* We loaded the jump table.  Go.  */ \
    jmp *%ebx

/* Split form of the above: _VALUE does the PC-relative table-address
   setup early (hiding its latency), _TAIL does the indexed branch.  */
# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
    addl $(TABLE - .), %ebx

# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
    addl (%ebx,INDEX,SCALE), %ebx; \
    /* We loaded the jump table.  Go.  */ \
    jmp *%ebx
#else
/* Non-PIC build: %ebx need not be preserved and jump tables hold
   absolute addresses, so a single indirect jump suffices.  */
# define PARMS 4
# define ENTRANCE
# define RETURN_END ret
# define RETURN RETURN_END
# define JMPTBL(I, B) I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register containing the index into the
   jump table.  SCALE is the scale of INDEX.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
    jmp *TABLE(,INDEX,SCALE)

# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)

# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
    jmp *TABLE(,INDEX,SCALE)
#endif
94 | |
	.section .text.ssse3,"ax" ,@progbits
#ifdef SHARED
/* void *__memcpy_chk (void *dest, const void *src, size_t len,
		       size_t destlen)
   Fortify entry point: abort via __chk_fail when the copy length
   (len, at 12(%esp)) exceeds the destination object size (destlen,
   at 16(%esp)); otherwise fall through into MEMCPY below.  Offsets
   are relative to %esp at entry (ENTRANCE has not run yet).  */
ENTRY (MEMCPY_CHK)
	movl	12(%esp), %eax		/* %eax = len.  */
	cmpl	%eax, 16(%esp)		/* destlen < len (unsigned)?  */
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif
103 | ENTRY (MEMCPY) |
104 | ENTRANCE |
105 | movl LEN(%esp), %ecx |
106 | movl SRC(%esp), %eax |
107 | movl DEST(%esp), %edx |
108 | |
109 | #ifdef USE_AS_MEMMOVE |
110 | cmp %eax, %edx |
111 | jb L(copy_forward) |
112 | je L(fwd_write_0bytes) |
113 | cmp $48, %ecx |
114 | jb L(bk_write_less48bytes) |
115 | add %ecx, %eax |
116 | cmp %eax, %edx |
117 | movl SRC(%esp), %eax |
118 | jb L(copy_backward) |
119 | |
120 | L(copy_forward): |
121 | #endif |
122 | cmp $48, %ecx |
123 | jae L(48bytesormore) |
124 | |
125 | L(fwd_write_less32bytes): |
126 | #ifndef USE_AS_MEMMOVE |
127 | cmp %dl, %al |
128 | jb L(bk_write) |
129 | #endif |
130 | add %ecx, %edx |
131 | add %ecx, %eax |
132 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) |
133 | #ifndef USE_AS_MEMMOVE |
134 | L(bk_write): |
135 | BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) |
136 | #endif |
137 | |
138 | ALIGN (4) |
139 | /* ECX > 32 and EDX is 4 byte aligned. */ |
140 | L(48bytesormore): |
141 | movdqu (%eax), %xmm0 |
142 | PUSH (%edi) |
143 | movl %edx, %edi |
144 | and $-16, %edx |
145 | PUSH (%esi) |
146 | cfi_remember_state |
147 | add $16, %edx |
148 | movl %edi, %esi |
149 | sub %edx, %edi |
150 | add %edi, %ecx |
151 | sub %edi, %eax |
152 | |
153 | #ifdef SHARED_CACHE_SIZE_HALF |
154 | cmp $SHARED_CACHE_SIZE_HALF, %ecx |
155 | #else |
156 | # ifdef PIC |
157 | SETUP_PIC_REG(bx) |
158 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
159 | cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx |
160 | # else |
161 | cmp __x86_shared_cache_size_half, %ecx |
162 | # endif |
163 | #endif |
164 | |
165 | mov %eax, %edi |
166 | jae L(large_page) |
167 | and $0xf, %edi |
168 | jz L(shl_0) |
169 | |
170 | BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) |
171 | |
172 | ALIGN (4) |
173 | L(shl_0): |
174 | movdqu %xmm0, (%esi) |
175 | xor %edi, %edi |
176 | cmp $127, %ecx |
177 | ja L(shl_0_gobble) |
178 | lea -32(%ecx), %ecx |
179 | L(shl_0_loop): |
180 | movdqa (%eax, %edi), %xmm0 |
181 | movdqa 16(%eax, %edi), %xmm1 |
182 | sub $32, %ecx |
183 | movdqa %xmm0, (%edx, %edi) |
184 | movdqa %xmm1, 16(%edx, %edi) |
185 | lea 32(%edi), %edi |
186 | jb L(shl_0_end) |
187 | |
188 | movdqa (%eax, %edi), %xmm0 |
189 | movdqa 16(%eax, %edi), %xmm1 |
190 | sub $32, %ecx |
191 | movdqa %xmm0, (%edx, %edi) |
192 | movdqa %xmm1, 16(%edx, %edi) |
193 | lea 32(%edi), %edi |
194 | jb L(shl_0_end) |
195 | |
196 | movdqa (%eax, %edi), %xmm0 |
197 | movdqa 16(%eax, %edi), %xmm1 |
198 | sub $32, %ecx |
199 | movdqa %xmm0, (%edx, %edi) |
200 | movdqa %xmm1, 16(%edx, %edi) |
201 | lea 32(%edi), %edi |
202 | jb L(shl_0_end) |
203 | |
204 | movdqa (%eax, %edi), %xmm0 |
205 | movdqa 16(%eax, %edi), %xmm1 |
206 | sub $32, %ecx |
207 | movdqa %xmm0, (%edx, %edi) |
208 | movdqa %xmm1, 16(%edx, %edi) |
209 | lea 32(%edi), %edi |
210 | L(shl_0_end): |
211 | lea 32(%ecx), %ecx |
212 | add %ecx, %edi |
213 | add %edi, %edx |
214 | add %edi, %eax |
215 | POP (%esi) |
216 | POP (%edi) |
217 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) |
218 | |
219 | cfi_restore_state |
220 | cfi_remember_state |
221 | L(shl_0_gobble): |
222 | |
223 | #ifdef DATA_CACHE_SIZE_HALF |
224 | cmp $DATA_CACHE_SIZE_HALF, %ecx |
225 | #else |
226 | # ifdef PIC |
227 | SETUP_PIC_REG(bx) |
228 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
229 | mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi |
230 | # else |
231 | mov __x86_data_cache_size_half, %edi |
232 | # endif |
233 | #endif |
234 | mov %edi, %esi |
235 | shr $3, %esi |
236 | sub %esi, %edi |
237 | cmp %edi, %ecx |
238 | jae L(shl_0_gobble_mem_start) |
239 | sub $128, %ecx |
240 | ALIGN (4) |
241 | L(shl_0_gobble_cache_loop): |
242 | movdqa (%eax), %xmm0 |
243 | movaps 0x10(%eax), %xmm1 |
244 | movaps 0x20(%eax), %xmm2 |
245 | movaps 0x30(%eax), %xmm3 |
246 | movaps 0x40(%eax), %xmm4 |
247 | movaps 0x50(%eax), %xmm5 |
248 | movaps 0x60(%eax), %xmm6 |
249 | movaps 0x70(%eax), %xmm7 |
250 | lea 0x80(%eax), %eax |
251 | sub $128, %ecx |
252 | movdqa %xmm0, (%edx) |
253 | movaps %xmm1, 0x10(%edx) |
254 | movaps %xmm2, 0x20(%edx) |
255 | movaps %xmm3, 0x30(%edx) |
256 | movaps %xmm4, 0x40(%edx) |
257 | movaps %xmm5, 0x50(%edx) |
258 | movaps %xmm6, 0x60(%edx) |
259 | movaps %xmm7, 0x70(%edx) |
260 | lea 0x80(%edx), %edx |
261 | |
262 | jae L(shl_0_gobble_cache_loop) |
263 | add $0x80, %ecx |
264 | cmp $0x40, %ecx |
265 | jb L(shl_0_cache_less_64bytes) |
266 | |
267 | movdqa (%eax), %xmm0 |
268 | sub $0x40, %ecx |
269 | movdqa 0x10(%eax), %xmm1 |
270 | |
271 | movdqa %xmm0, (%edx) |
272 | movdqa %xmm1, 0x10(%edx) |
273 | |
274 | movdqa 0x20(%eax), %xmm0 |
275 | movdqa 0x30(%eax), %xmm1 |
276 | add $0x40, %eax |
277 | |
278 | movdqa %xmm0, 0x20(%edx) |
279 | movdqa %xmm1, 0x30(%edx) |
280 | add $0x40, %edx |
281 | L(shl_0_cache_less_64bytes): |
282 | cmp $0x20, %ecx |
283 | jb L(shl_0_cache_less_32bytes) |
284 | movdqa (%eax), %xmm0 |
285 | sub $0x20, %ecx |
286 | movdqa 0x10(%eax), %xmm1 |
287 | add $0x20, %eax |
288 | movdqa %xmm0, (%edx) |
289 | movdqa %xmm1, 0x10(%edx) |
290 | add $0x20, %edx |
291 | L(shl_0_cache_less_32bytes): |
292 | cmp $0x10, %ecx |
293 | jb L(shl_0_cache_less_16bytes) |
294 | sub $0x10, %ecx |
295 | movdqa (%eax), %xmm0 |
296 | add $0x10, %eax |
297 | movdqa %xmm0, (%edx) |
298 | add $0x10, %edx |
299 | L(shl_0_cache_less_16bytes): |
300 | add %ecx, %edx |
301 | add %ecx, %eax |
302 | POP (%esi) |
303 | POP (%edi) |
304 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) |
305 | |
306 | cfi_restore_state |
307 | cfi_remember_state |
308 | ALIGN (4) |
309 | L(shl_0_gobble_mem_start): |
310 | cmp %al, %dl |
311 | je L(copy_page_by_rep) |
312 | sub $128, %ecx |
313 | L(shl_0_gobble_mem_loop): |
314 | prefetchnta 0x1c0(%eax) |
315 | prefetchnta 0x280(%eax) |
316 | prefetchnta 0x1c0(%edx) |
317 | prefetchnta 0x280(%edx) |
318 | |
319 | movdqa (%eax), %xmm0 |
320 | movaps 0x10(%eax), %xmm1 |
321 | movaps 0x20(%eax), %xmm2 |
322 | movaps 0x30(%eax), %xmm3 |
323 | movaps 0x40(%eax), %xmm4 |
324 | movaps 0x50(%eax), %xmm5 |
325 | movaps 0x60(%eax), %xmm6 |
326 | movaps 0x70(%eax), %xmm7 |
327 | lea 0x80(%eax), %eax |
328 | sub $0x80, %ecx |
329 | movdqa %xmm0, (%edx) |
330 | movaps %xmm1, 0x10(%edx) |
331 | movaps %xmm2, 0x20(%edx) |
332 | movaps %xmm3, 0x30(%edx) |
333 | movaps %xmm4, 0x40(%edx) |
334 | movaps %xmm5, 0x50(%edx) |
335 | movaps %xmm6, 0x60(%edx) |
336 | movaps %xmm7, 0x70(%edx) |
337 | lea 0x80(%edx), %edx |
338 | |
339 | jae L(shl_0_gobble_mem_loop) |
340 | add $0x80, %ecx |
341 | cmp $0x40, %ecx |
342 | jb L(shl_0_mem_less_64bytes) |
343 | |
344 | movdqa (%eax), %xmm0 |
345 | sub $0x40, %ecx |
346 | movdqa 0x10(%eax), %xmm1 |
347 | |
348 | movdqa %xmm0, (%edx) |
349 | movdqa %xmm1, 0x10(%edx) |
350 | |
351 | movdqa 0x20(%eax), %xmm0 |
352 | movdqa 0x30(%eax), %xmm1 |
353 | add $0x40, %eax |
354 | |
355 | movdqa %xmm0, 0x20(%edx) |
356 | movdqa %xmm1, 0x30(%edx) |
357 | add $0x40, %edx |
358 | L(shl_0_mem_less_64bytes): |
359 | cmp $0x20, %ecx |
360 | jb L(shl_0_mem_less_32bytes) |
361 | movdqa (%eax), %xmm0 |
362 | sub $0x20, %ecx |
363 | movdqa 0x10(%eax), %xmm1 |
364 | add $0x20, %eax |
365 | movdqa %xmm0, (%edx) |
366 | movdqa %xmm1, 0x10(%edx) |
367 | add $0x20, %edx |
368 | L(shl_0_mem_less_32bytes): |
369 | cmp $0x10, %ecx |
370 | jb L(shl_0_mem_less_16bytes) |
371 | sub $0x10, %ecx |
372 | movdqa (%eax), %xmm0 |
373 | add $0x10, %eax |
374 | movdqa %xmm0, (%edx) |
375 | add $0x10, %edx |
376 | L(shl_0_mem_less_16bytes): |
377 | add %ecx, %edx |
378 | add %ecx, %eax |
379 | POP (%esi) |
380 | POP (%edi) |
381 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) |
382 | |
383 | cfi_restore_state |
384 | cfi_remember_state |
385 | ALIGN (4) |
386 | L(shl_1): |
387 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
388 | sub $1, %eax |
389 | movaps (%eax), %xmm1 |
390 | xor %edi, %edi |
391 | sub $32, %ecx |
392 | movdqu %xmm0, (%esi) |
393 | POP (%esi) |
394 | L(shl_1_loop): |
395 | |
396 | movdqa 16(%eax, %edi), %xmm2 |
397 | sub $32, %ecx |
398 | movdqa 32(%eax, %edi), %xmm3 |
399 | movdqa %xmm3, %xmm4 |
400 | palignr $1, %xmm2, %xmm3 |
401 | palignr $1, %xmm1, %xmm2 |
402 | lea 32(%edi), %edi |
403 | movdqa %xmm2, -32(%edx, %edi) |
404 | movdqa %xmm3, -16(%edx, %edi) |
405 | |
406 | jb L(shl_1_end) |
407 | |
408 | movdqa 16(%eax, %edi), %xmm2 |
409 | sub $32, %ecx |
410 | movdqa 32(%eax, %edi), %xmm3 |
411 | movdqa %xmm3, %xmm1 |
412 | palignr $1, %xmm2, %xmm3 |
413 | palignr $1, %xmm4, %xmm2 |
414 | lea 32(%edi), %edi |
415 | movdqa %xmm2, -32(%edx, %edi) |
416 | movdqa %xmm3, -16(%edx, %edi) |
417 | |
418 | jae L(shl_1_loop) |
419 | |
420 | L(shl_1_end): |
421 | add $32, %ecx |
422 | add %ecx, %edi |
423 | add %edi, %edx |
424 | lea 1(%edi, %eax), %eax |
425 | POP (%edi) |
426 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
427 | |
428 | cfi_restore_state |
429 | cfi_remember_state |
430 | ALIGN (4) |
431 | L(shl_2): |
432 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
433 | sub $2, %eax |
434 | movaps (%eax), %xmm1 |
435 | xor %edi, %edi |
436 | sub $32, %ecx |
437 | movdqu %xmm0, (%esi) |
438 | POP (%esi) |
439 | L(shl_2_loop): |
440 | |
441 | movdqa 16(%eax, %edi), %xmm2 |
442 | sub $32, %ecx |
443 | movdqa 32(%eax, %edi), %xmm3 |
444 | movdqa %xmm3, %xmm4 |
445 | palignr $2, %xmm2, %xmm3 |
446 | palignr $2, %xmm1, %xmm2 |
447 | lea 32(%edi), %edi |
448 | movdqa %xmm2, -32(%edx, %edi) |
449 | movdqa %xmm3, -16(%edx, %edi) |
450 | |
451 | jb L(shl_2_end) |
452 | |
453 | movdqa 16(%eax, %edi), %xmm2 |
454 | sub $32, %ecx |
455 | movdqa 32(%eax, %edi), %xmm3 |
456 | movdqa %xmm3, %xmm1 |
457 | palignr $2, %xmm2, %xmm3 |
458 | palignr $2, %xmm4, %xmm2 |
459 | lea 32(%edi), %edi |
460 | movdqa %xmm2, -32(%edx, %edi) |
461 | movdqa %xmm3, -16(%edx, %edi) |
462 | |
463 | jae L(shl_2_loop) |
464 | |
465 | L(shl_2_end): |
466 | add $32, %ecx |
467 | add %ecx, %edi |
468 | add %edi, %edx |
469 | lea 2(%edi, %eax), %eax |
470 | POP (%edi) |
471 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
472 | |
473 | cfi_restore_state |
474 | cfi_remember_state |
475 | ALIGN (4) |
476 | L(shl_3): |
477 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
478 | sub $3, %eax |
479 | movaps (%eax), %xmm1 |
480 | xor %edi, %edi |
481 | sub $32, %ecx |
482 | movdqu %xmm0, (%esi) |
483 | POP (%esi) |
484 | L(shl_3_loop): |
485 | |
486 | movdqa 16(%eax, %edi), %xmm2 |
487 | sub $32, %ecx |
488 | movdqa 32(%eax, %edi), %xmm3 |
489 | movdqa %xmm3, %xmm4 |
490 | palignr $3, %xmm2, %xmm3 |
491 | palignr $3, %xmm1, %xmm2 |
492 | lea 32(%edi), %edi |
493 | movdqa %xmm2, -32(%edx, %edi) |
494 | movdqa %xmm3, -16(%edx, %edi) |
495 | |
496 | jb L(shl_3_end) |
497 | |
498 | movdqa 16(%eax, %edi), %xmm2 |
499 | sub $32, %ecx |
500 | movdqa 32(%eax, %edi), %xmm3 |
501 | movdqa %xmm3, %xmm1 |
502 | palignr $3, %xmm2, %xmm3 |
503 | palignr $3, %xmm4, %xmm2 |
504 | lea 32(%edi), %edi |
505 | movdqa %xmm2, -32(%edx, %edi) |
506 | movdqa %xmm3, -16(%edx, %edi) |
507 | |
508 | jae L(shl_3_loop) |
509 | |
510 | L(shl_3_end): |
511 | add $32, %ecx |
512 | add %ecx, %edi |
513 | add %edi, %edx |
514 | lea 3(%edi, %eax), %eax |
515 | POP (%edi) |
516 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
517 | |
518 | cfi_restore_state |
519 | cfi_remember_state |
520 | ALIGN (4) |
521 | L(shl_4): |
522 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
523 | sub $4, %eax |
524 | movaps (%eax), %xmm1 |
525 | xor %edi, %edi |
526 | sub $32, %ecx |
527 | movdqu %xmm0, (%esi) |
528 | POP (%esi) |
529 | L(shl_4_loop): |
530 | |
531 | movdqa 16(%eax, %edi), %xmm2 |
532 | sub $32, %ecx |
533 | movdqa 32(%eax, %edi), %xmm3 |
534 | movdqa %xmm3, %xmm4 |
535 | palignr $4, %xmm2, %xmm3 |
536 | palignr $4, %xmm1, %xmm2 |
537 | lea 32(%edi), %edi |
538 | movdqa %xmm2, -32(%edx, %edi) |
539 | movdqa %xmm3, -16(%edx, %edi) |
540 | |
541 | jb L(shl_4_end) |
542 | |
543 | movdqa 16(%eax, %edi), %xmm2 |
544 | sub $32, %ecx |
545 | movdqa 32(%eax, %edi), %xmm3 |
546 | movdqa %xmm3, %xmm1 |
547 | palignr $4, %xmm2, %xmm3 |
548 | palignr $4, %xmm4, %xmm2 |
549 | lea 32(%edi), %edi |
550 | movdqa %xmm2, -32(%edx, %edi) |
551 | movdqa %xmm3, -16(%edx, %edi) |
552 | |
553 | jae L(shl_4_loop) |
554 | |
555 | L(shl_4_end): |
556 | add $32, %ecx |
557 | add %ecx, %edi |
558 | add %edi, %edx |
559 | lea 4(%edi, %eax), %eax |
560 | POP (%edi) |
561 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
562 | |
563 | cfi_restore_state |
564 | cfi_remember_state |
565 | ALIGN (4) |
566 | L(shl_5): |
567 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
568 | sub $5, %eax |
569 | movaps (%eax), %xmm1 |
570 | xor %edi, %edi |
571 | sub $32, %ecx |
572 | movdqu %xmm0, (%esi) |
573 | POP (%esi) |
574 | L(shl_5_loop): |
575 | |
576 | movdqa 16(%eax, %edi), %xmm2 |
577 | sub $32, %ecx |
578 | movdqa 32(%eax, %edi), %xmm3 |
579 | movdqa %xmm3, %xmm4 |
580 | palignr $5, %xmm2, %xmm3 |
581 | palignr $5, %xmm1, %xmm2 |
582 | lea 32(%edi), %edi |
583 | movdqa %xmm2, -32(%edx, %edi) |
584 | movdqa %xmm3, -16(%edx, %edi) |
585 | |
586 | jb L(shl_5_end) |
587 | |
588 | movdqa 16(%eax, %edi), %xmm2 |
589 | sub $32, %ecx |
590 | movdqa 32(%eax, %edi), %xmm3 |
591 | movdqa %xmm3, %xmm1 |
592 | palignr $5, %xmm2, %xmm3 |
593 | palignr $5, %xmm4, %xmm2 |
594 | lea 32(%edi), %edi |
595 | movdqa %xmm2, -32(%edx, %edi) |
596 | movdqa %xmm3, -16(%edx, %edi) |
597 | |
598 | jae L(shl_5_loop) |
599 | |
600 | L(shl_5_end): |
601 | add $32, %ecx |
602 | add %ecx, %edi |
603 | add %edi, %edx |
604 | lea 5(%edi, %eax), %eax |
605 | POP (%edi) |
606 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
607 | |
608 | cfi_restore_state |
609 | cfi_remember_state |
610 | ALIGN (4) |
611 | L(shl_6): |
612 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
613 | sub $6, %eax |
614 | movaps (%eax), %xmm1 |
615 | xor %edi, %edi |
616 | sub $32, %ecx |
617 | movdqu %xmm0, (%esi) |
618 | POP (%esi) |
619 | L(shl_6_loop): |
620 | |
621 | movdqa 16(%eax, %edi), %xmm2 |
622 | sub $32, %ecx |
623 | movdqa 32(%eax, %edi), %xmm3 |
624 | movdqa %xmm3, %xmm4 |
625 | palignr $6, %xmm2, %xmm3 |
626 | palignr $6, %xmm1, %xmm2 |
627 | lea 32(%edi), %edi |
628 | movdqa %xmm2, -32(%edx, %edi) |
629 | movdqa %xmm3, -16(%edx, %edi) |
630 | |
631 | jb L(shl_6_end) |
632 | |
633 | movdqa 16(%eax, %edi), %xmm2 |
634 | sub $32, %ecx |
635 | movdqa 32(%eax, %edi), %xmm3 |
636 | movdqa %xmm3, %xmm1 |
637 | palignr $6, %xmm2, %xmm3 |
638 | palignr $6, %xmm4, %xmm2 |
639 | lea 32(%edi), %edi |
640 | movdqa %xmm2, -32(%edx, %edi) |
641 | movdqa %xmm3, -16(%edx, %edi) |
642 | |
643 | jae L(shl_6_loop) |
644 | |
645 | L(shl_6_end): |
646 | add $32, %ecx |
647 | add %ecx, %edi |
648 | add %edi, %edx |
649 | lea 6(%edi, %eax), %eax |
650 | POP (%edi) |
651 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
652 | |
653 | cfi_restore_state |
654 | cfi_remember_state |
655 | ALIGN (4) |
656 | L(shl_7): |
657 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
658 | sub $7, %eax |
659 | movaps (%eax), %xmm1 |
660 | xor %edi, %edi |
661 | sub $32, %ecx |
662 | movdqu %xmm0, (%esi) |
663 | POP (%esi) |
664 | L(shl_7_loop): |
665 | |
666 | movdqa 16(%eax, %edi), %xmm2 |
667 | sub $32, %ecx |
668 | movdqa 32(%eax, %edi), %xmm3 |
669 | movdqa %xmm3, %xmm4 |
670 | palignr $7, %xmm2, %xmm3 |
671 | palignr $7, %xmm1, %xmm2 |
672 | lea 32(%edi), %edi |
673 | movdqa %xmm2, -32(%edx, %edi) |
674 | movdqa %xmm3, -16(%edx, %edi) |
675 | |
676 | jb L(shl_7_end) |
677 | |
678 | movdqa 16(%eax, %edi), %xmm2 |
679 | sub $32, %ecx |
680 | movdqa 32(%eax, %edi), %xmm3 |
681 | movdqa %xmm3, %xmm1 |
682 | palignr $7, %xmm2, %xmm3 |
683 | palignr $7, %xmm4, %xmm2 |
684 | lea 32(%edi), %edi |
685 | movdqa %xmm2, -32(%edx, %edi) |
686 | movdqa %xmm3, -16(%edx, %edi) |
687 | |
688 | jae L(shl_7_loop) |
689 | |
690 | L(shl_7_end): |
691 | add $32, %ecx |
692 | add %ecx, %edi |
693 | add %edi, %edx |
694 | lea 7(%edi, %eax), %eax |
695 | POP (%edi) |
696 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
697 | |
698 | cfi_restore_state |
699 | cfi_remember_state |
700 | ALIGN (4) |
701 | L(shl_8): |
702 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
703 | sub $8, %eax |
704 | movaps (%eax), %xmm1 |
705 | xor %edi, %edi |
706 | sub $32, %ecx |
707 | movdqu %xmm0, (%esi) |
708 | POP (%esi) |
709 | L(shl_8_loop): |
710 | |
711 | movdqa 16(%eax, %edi), %xmm2 |
712 | sub $32, %ecx |
713 | movdqa 32(%eax, %edi), %xmm3 |
714 | movdqa %xmm3, %xmm4 |
715 | palignr $8, %xmm2, %xmm3 |
716 | palignr $8, %xmm1, %xmm2 |
717 | lea 32(%edi), %edi |
718 | movdqa %xmm2, -32(%edx, %edi) |
719 | movdqa %xmm3, -16(%edx, %edi) |
720 | |
721 | jb L(shl_8_end) |
722 | |
723 | movdqa 16(%eax, %edi), %xmm2 |
724 | sub $32, %ecx |
725 | movdqa 32(%eax, %edi), %xmm3 |
726 | movdqa %xmm3, %xmm1 |
727 | palignr $8, %xmm2, %xmm3 |
728 | palignr $8, %xmm4, %xmm2 |
729 | lea 32(%edi), %edi |
730 | movdqa %xmm2, -32(%edx, %edi) |
731 | movdqa %xmm3, -16(%edx, %edi) |
732 | |
733 | jae L(shl_8_loop) |
734 | |
735 | L(shl_8_end): |
736 | add $32, %ecx |
737 | add %ecx, %edi |
738 | add %edi, %edx |
739 | lea 8(%edi, %eax), %eax |
740 | POP (%edi) |
741 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
742 | |
743 | cfi_restore_state |
744 | cfi_remember_state |
745 | ALIGN (4) |
746 | L(shl_9): |
747 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
748 | sub $9, %eax |
749 | movaps (%eax), %xmm1 |
750 | xor %edi, %edi |
751 | sub $32, %ecx |
752 | movdqu %xmm0, (%esi) |
753 | POP (%esi) |
754 | L(shl_9_loop): |
755 | |
756 | movdqa 16(%eax, %edi), %xmm2 |
757 | sub $32, %ecx |
758 | movdqa 32(%eax, %edi), %xmm3 |
759 | movdqa %xmm3, %xmm4 |
760 | palignr $9, %xmm2, %xmm3 |
761 | palignr $9, %xmm1, %xmm2 |
762 | lea 32(%edi), %edi |
763 | movdqa %xmm2, -32(%edx, %edi) |
764 | movdqa %xmm3, -16(%edx, %edi) |
765 | |
766 | jb L(shl_9_end) |
767 | |
768 | movdqa 16(%eax, %edi), %xmm2 |
769 | sub $32, %ecx |
770 | movdqa 32(%eax, %edi), %xmm3 |
771 | movdqa %xmm3, %xmm1 |
772 | palignr $9, %xmm2, %xmm3 |
773 | palignr $9, %xmm4, %xmm2 |
774 | lea 32(%edi), %edi |
775 | movdqa %xmm2, -32(%edx, %edi) |
776 | movdqa %xmm3, -16(%edx, %edi) |
777 | |
778 | jae L(shl_9_loop) |
779 | |
780 | L(shl_9_end): |
781 | add $32, %ecx |
782 | add %ecx, %edi |
783 | add %edi, %edx |
784 | lea 9(%edi, %eax), %eax |
785 | POP (%edi) |
786 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
787 | |
788 | cfi_restore_state |
789 | cfi_remember_state |
790 | ALIGN (4) |
791 | L(shl_10): |
792 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
793 | sub $10, %eax |
794 | movaps (%eax), %xmm1 |
795 | xor %edi, %edi |
796 | sub $32, %ecx |
797 | movdqu %xmm0, (%esi) |
798 | POP (%esi) |
799 | L(shl_10_loop): |
800 | |
801 | movdqa 16(%eax, %edi), %xmm2 |
802 | sub $32, %ecx |
803 | movdqa 32(%eax, %edi), %xmm3 |
804 | movdqa %xmm3, %xmm4 |
805 | palignr $10, %xmm2, %xmm3 |
806 | palignr $10, %xmm1, %xmm2 |
807 | lea 32(%edi), %edi |
808 | movdqa %xmm2, -32(%edx, %edi) |
809 | movdqa %xmm3, -16(%edx, %edi) |
810 | |
811 | jb L(shl_10_end) |
812 | |
813 | movdqa 16(%eax, %edi), %xmm2 |
814 | sub $32, %ecx |
815 | movdqa 32(%eax, %edi), %xmm3 |
816 | movdqa %xmm3, %xmm1 |
817 | palignr $10, %xmm2, %xmm3 |
818 | palignr $10, %xmm4, %xmm2 |
819 | lea 32(%edi), %edi |
820 | movdqa %xmm2, -32(%edx, %edi) |
821 | movdqa %xmm3, -16(%edx, %edi) |
822 | |
823 | jae L(shl_10_loop) |
824 | |
825 | L(shl_10_end): |
826 | add $32, %ecx |
827 | add %ecx, %edi |
828 | add %edi, %edx |
829 | lea 10(%edi, %eax), %eax |
830 | POP (%edi) |
831 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
832 | |
833 | cfi_restore_state |
834 | cfi_remember_state |
835 | ALIGN (4) |
836 | L(shl_11): |
837 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
838 | sub $11, %eax |
839 | movaps (%eax), %xmm1 |
840 | xor %edi, %edi |
841 | sub $32, %ecx |
842 | movdqu %xmm0, (%esi) |
843 | POP (%esi) |
844 | L(shl_11_loop): |
845 | |
846 | movdqa 16(%eax, %edi), %xmm2 |
847 | sub $32, %ecx |
848 | movdqa 32(%eax, %edi), %xmm3 |
849 | movdqa %xmm3, %xmm4 |
850 | palignr $11, %xmm2, %xmm3 |
851 | palignr $11, %xmm1, %xmm2 |
852 | lea 32(%edi), %edi |
853 | movdqa %xmm2, -32(%edx, %edi) |
854 | movdqa %xmm3, -16(%edx, %edi) |
855 | |
856 | jb L(shl_11_end) |
857 | |
858 | movdqa 16(%eax, %edi), %xmm2 |
859 | sub $32, %ecx |
860 | movdqa 32(%eax, %edi), %xmm3 |
861 | movdqa %xmm3, %xmm1 |
862 | palignr $11, %xmm2, %xmm3 |
863 | palignr $11, %xmm4, %xmm2 |
864 | lea 32(%edi), %edi |
865 | movdqa %xmm2, -32(%edx, %edi) |
866 | movdqa %xmm3, -16(%edx, %edi) |
867 | |
868 | jae L(shl_11_loop) |
869 | |
870 | L(shl_11_end): |
871 | add $32, %ecx |
872 | add %ecx, %edi |
873 | add %edi, %edx |
874 | lea 11(%edi, %eax), %eax |
875 | POP (%edi) |
876 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
877 | |
878 | cfi_restore_state |
879 | cfi_remember_state |
880 | ALIGN (4) |
881 | L(shl_12): |
882 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
883 | sub $12, %eax |
884 | movaps (%eax), %xmm1 |
885 | xor %edi, %edi |
886 | sub $32, %ecx |
887 | movdqu %xmm0, (%esi) |
888 | POP (%esi) |
889 | L(shl_12_loop): |
890 | |
891 | movdqa 16(%eax, %edi), %xmm2 |
892 | sub $32, %ecx |
893 | movdqa 32(%eax, %edi), %xmm3 |
894 | movdqa %xmm3, %xmm4 |
895 | palignr $12, %xmm2, %xmm3 |
896 | palignr $12, %xmm1, %xmm2 |
897 | lea 32(%edi), %edi |
898 | movdqa %xmm2, -32(%edx, %edi) |
899 | movdqa %xmm3, -16(%edx, %edi) |
900 | |
901 | jb L(shl_12_end) |
902 | |
903 | movdqa 16(%eax, %edi), %xmm2 |
904 | sub $32, %ecx |
905 | movdqa 32(%eax, %edi), %xmm3 |
906 | movdqa %xmm3, %xmm1 |
907 | palignr $12, %xmm2, %xmm3 |
908 | palignr $12, %xmm4, %xmm2 |
909 | lea 32(%edi), %edi |
910 | movdqa %xmm2, -32(%edx, %edi) |
911 | movdqa %xmm3, -16(%edx, %edi) |
912 | |
913 | jae L(shl_12_loop) |
914 | |
915 | L(shl_12_end): |
916 | add $32, %ecx |
917 | add %ecx, %edi |
918 | add %edi, %edx |
919 | lea 12(%edi, %eax), %eax |
920 | POP (%edi) |
921 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
922 | |
923 | cfi_restore_state |
924 | cfi_remember_state |
925 | ALIGN (4) |
926 | L(shl_13): |
927 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
928 | sub $13, %eax |
929 | movaps (%eax), %xmm1 |
930 | xor %edi, %edi |
931 | sub $32, %ecx |
932 | movdqu %xmm0, (%esi) |
933 | POP (%esi) |
934 | L(shl_13_loop): |
935 | |
936 | movdqa 16(%eax, %edi), %xmm2 |
937 | sub $32, %ecx |
938 | movdqa 32(%eax, %edi), %xmm3 |
939 | movdqa %xmm3, %xmm4 |
940 | palignr $13, %xmm2, %xmm3 |
941 | palignr $13, %xmm1, %xmm2 |
942 | lea 32(%edi), %edi |
943 | movdqa %xmm2, -32(%edx, %edi) |
944 | movdqa %xmm3, -16(%edx, %edi) |
945 | |
946 | jb L(shl_13_end) |
947 | |
948 | movdqa 16(%eax, %edi), %xmm2 |
949 | sub $32, %ecx |
950 | movdqa 32(%eax, %edi), %xmm3 |
951 | movdqa %xmm3, %xmm1 |
952 | palignr $13, %xmm2, %xmm3 |
953 | palignr $13, %xmm4, %xmm2 |
954 | lea 32(%edi), %edi |
955 | movdqa %xmm2, -32(%edx, %edi) |
956 | movdqa %xmm3, -16(%edx, %edi) |
957 | |
958 | jae L(shl_13_loop) |
959 | |
960 | L(shl_13_end): |
961 | add $32, %ecx |
962 | add %ecx, %edi |
963 | add %edi, %edx |
964 | lea 13(%edi, %eax), %eax |
965 | POP (%edi) |
966 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
967 | |
968 | cfi_restore_state |
969 | cfi_remember_state |
970 | ALIGN (4) |
971 | L(shl_14): |
972 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
973 | sub $14, %eax |
974 | movaps (%eax), %xmm1 |
975 | xor %edi, %edi |
976 | sub $32, %ecx |
977 | movdqu %xmm0, (%esi) |
978 | POP (%esi) |
979 | L(shl_14_loop): |
980 | |
981 | movdqa 16(%eax, %edi), %xmm2 |
982 | sub $32, %ecx |
983 | movdqa 32(%eax, %edi), %xmm3 |
984 | movdqa %xmm3, %xmm4 |
985 | palignr $14, %xmm2, %xmm3 |
986 | palignr $14, %xmm1, %xmm2 |
987 | lea 32(%edi), %edi |
988 | movdqa %xmm2, -32(%edx, %edi) |
989 | movdqa %xmm3, -16(%edx, %edi) |
990 | |
991 | jb L(shl_14_end) |
992 | |
993 | movdqa 16(%eax, %edi), %xmm2 |
994 | sub $32, %ecx |
995 | movdqa 32(%eax, %edi), %xmm3 |
996 | movdqa %xmm3, %xmm1 |
997 | palignr $14, %xmm2, %xmm3 |
998 | palignr $14, %xmm4, %xmm2 |
999 | lea 32(%edi), %edi |
1000 | movdqa %xmm2, -32(%edx, %edi) |
1001 | movdqa %xmm3, -16(%edx, %edi) |
1002 | |
1003 | jae L(shl_14_loop) |
1004 | |
1005 | L(shl_14_end): |
1006 | add $32, %ecx |
1007 | add %ecx, %edi |
1008 | add %edi, %edx |
1009 | lea 14(%edi, %eax), %eax |
1010 | POP (%edi) |
1011 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
1012 | |
1013 | cfi_restore_state |
1014 | cfi_remember_state |
1015 | ALIGN (4) |
1016 | L(shl_15): |
1017 | BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) |
1018 | sub $15, %eax |
1019 | movaps (%eax), %xmm1 |
1020 | xor %edi, %edi |
1021 | sub $32, %ecx |
1022 | movdqu %xmm0, (%esi) |
1023 | POP (%esi) |
1024 | L(shl_15_loop): |
1025 | |
1026 | movdqa 16(%eax, %edi), %xmm2 |
1027 | sub $32, %ecx |
1028 | movdqa 32(%eax, %edi), %xmm3 |
1029 | movdqa %xmm3, %xmm4 |
1030 | palignr $15, %xmm2, %xmm3 |
1031 | palignr $15, %xmm1, %xmm2 |
1032 | lea 32(%edi), %edi |
1033 | movdqa %xmm2, -32(%edx, %edi) |
1034 | movdqa %xmm3, -16(%edx, %edi) |
1035 | |
1036 | jb L(shl_15_end) |
1037 | |
1038 | movdqa 16(%eax, %edi), %xmm2 |
1039 | sub $32, %ecx |
1040 | movdqa 32(%eax, %edi), %xmm3 |
1041 | movdqa %xmm3, %xmm1 |
1042 | palignr $15, %xmm2, %xmm3 |
1043 | palignr $15, %xmm4, %xmm2 |
1044 | lea 32(%edi), %edi |
1045 | movdqa %xmm2, -32(%edx, %edi) |
1046 | movdqa %xmm3, -16(%edx, %edi) |
1047 | |
1048 | jae L(shl_15_loop) |
1049 | |
1050 | L(shl_15_end): |
1051 | add $32, %ecx |
1052 | add %ecx, %edi |
1053 | add %edi, %edx |
1054 | lea 15(%edi, %eax), %eax |
1055 | POP (%edi) |
1056 | BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) |
1057 | |
1058 | |
1059 | ALIGN (4) |
1060 | L(fwd_write_44bytes): |
1061 | movl -44(%eax), %ecx |
1062 | movl %ecx, -44(%edx) |
1063 | L(fwd_write_40bytes): |
1064 | movl -40(%eax), %ecx |
1065 | movl %ecx, -40(%edx) |
1066 | L(fwd_write_36bytes): |
1067 | movl -36(%eax), %ecx |
1068 | movl %ecx, -36(%edx) |
1069 | L(fwd_write_32bytes): |
1070 | movl -32(%eax), %ecx |
1071 | movl %ecx, -32(%edx) |
1072 | L(fwd_write_28bytes): |
1073 | movl -28(%eax), %ecx |
1074 | movl %ecx, -28(%edx) |
1075 | L(fwd_write_24bytes): |
1076 | movl -24(%eax), %ecx |
1077 | movl %ecx, -24(%edx) |
1078 | L(fwd_write_20bytes): |
1079 | movl -20(%eax), %ecx |
1080 | movl %ecx, -20(%edx) |
1081 | L(fwd_write_16bytes): |
1082 | movl -16(%eax), %ecx |
1083 | movl %ecx, -16(%edx) |
1084 | L(fwd_write_12bytes): |
1085 | movl -12(%eax), %ecx |
1086 | movl %ecx, -12(%edx) |
1087 | L(fwd_write_8bytes): |
1088 | movl -8(%eax), %ecx |
1089 | movl %ecx, -8(%edx) |
1090 | L(fwd_write_4bytes): |
1091 | movl -4(%eax), %ecx |
1092 | movl %ecx, -4(%edx) |
1093 | L(fwd_write_0bytes): |
1094 | #ifdef USE_AS_MEMPCPY |
1095 | movl %edx, %eax |
1096 | #else |
1097 | movl DEST(%esp), %eax |
1098 | #endif |
1099 | RETURN |
1100 | |
1101 | ALIGN (4) |
1102 | L(fwd_write_5bytes): |
1103 | movl -5(%eax), %ecx |
1104 | movl -4(%eax), %eax |
1105 | movl %ecx, -5(%edx) |
1106 | movl %eax, -4(%edx) |
1107 | #ifdef USE_AS_MEMPCPY |
1108 | movl %edx, %eax |
1109 | #else |
1110 | movl DEST(%esp), %eax |
1111 | #endif |
1112 | RETURN |
1113 | |
	ALIGN (4)
/* Forward tail copies for lengths congruent to 1 mod 4 (45 down
   to 9, then 1).  Each entry copies 4 bytes at offset -<size> and
   falls through.  After the 9-byte entry, a dword at -5 plus the
   final byte at -1 finish the copy (size 5 has its own block above
   using the overlapping-dword trick).  EAX = src + len,
   EDX = dst + len.  */
L(fwd_write_45bytes):
	movl -45(%eax), %ecx
	movl %ecx, -45(%edx)
L(fwd_write_41bytes):
	movl -41(%eax), %ecx
	movl %ecx, -41(%edx)
L(fwd_write_37bytes):
	movl -37(%eax), %ecx
	movl %ecx, -37(%edx)
L(fwd_write_33bytes):
	movl -33(%eax), %ecx
	movl %ecx, -33(%edx)
L(fwd_write_29bytes):
	movl -29(%eax), %ecx
	movl %ecx, -29(%edx)
L(fwd_write_25bytes):
	movl -25(%eax), %ecx
	movl %ecx, -25(%edx)
L(fwd_write_21bytes):
	movl -21(%eax), %ecx
	movl %ecx, -21(%edx)
L(fwd_write_17bytes):
	movl -17(%eax), %ecx
	movl %ecx, -17(%edx)
L(fwd_write_13bytes):
	movl -13(%eax), %ecx
	movl %ecx, -13(%edx)
L(fwd_write_9bytes):
	movl -9(%eax), %ecx
	movl %ecx, -9(%edx)
	/* Bytes [-5,-2]; the last byte is written below.  */
	movl -5(%eax), %ecx
	movl %ecx, -5(%edx)
L(fwd_write_1bytes):
	movzbl -1(%eax), %ecx
	movb %cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	/* mempcpy returns dst + len = EDX.  */
	movl %edx, %eax
#else
	movl DEST(%esp), %eax
#endif
	RETURN
1156 | |
	ALIGN (4)
/* Forward tail copies for lengths congruent to 2 mod 4 (46 down
   to 2).  Each entry copies 4 bytes at offset -<size> and falls
   through; the chain ends with a 2-byte word at -2.
   EAX = src + len, EDX = dst + len.  */
L(fwd_write_46bytes):
	movl -46(%eax), %ecx
	movl %ecx, -46(%edx)
L(fwd_write_42bytes):
	movl -42(%eax), %ecx
	movl %ecx, -42(%edx)
L(fwd_write_38bytes):
	movl -38(%eax), %ecx
	movl %ecx, -38(%edx)
L(fwd_write_34bytes):
	movl -34(%eax), %ecx
	movl %ecx, -34(%edx)
L(fwd_write_30bytes):
	movl -30(%eax), %ecx
	movl %ecx, -30(%edx)
L(fwd_write_26bytes):
	movl -26(%eax), %ecx
	movl %ecx, -26(%edx)
L(fwd_write_22bytes):
	movl -22(%eax), %ecx
	movl %ecx, -22(%edx)
L(fwd_write_18bytes):
	movl -18(%eax), %ecx
	movl %ecx, -18(%edx)
L(fwd_write_14bytes):
	movl -14(%eax), %ecx
	movl %ecx, -14(%edx)
L(fwd_write_10bytes):
	movl -10(%eax), %ecx
	movl %ecx, -10(%edx)
L(fwd_write_6bytes):
	movl -6(%eax), %ecx
	movl %ecx, -6(%edx)
L(fwd_write_2bytes):
	movzwl -2(%eax), %ecx
	movw %cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	/* mempcpy returns dst + len = EDX.  */
	movl %edx, %eax
#else
	movl DEST(%esp), %eax
#endif
	RETURN
1200 | |
	ALIGN (4)
/* Forward tail copies for lengths congruent to 3 mod 4 (47 down
   to 3).  Each entry copies 4 bytes at offset -<size> and falls
   through; the chain ends with a word at -3 plus a byte at -1.
   EAX = src + len, EDX = dst + len.  Ends with RETURN_END because
   this is the last entry of the forward group.  */
L(fwd_write_47bytes):
	movl -47(%eax), %ecx
	movl %ecx, -47(%edx)
L(fwd_write_43bytes):
	movl -43(%eax), %ecx
	movl %ecx, -43(%edx)
L(fwd_write_39bytes):
	movl -39(%eax), %ecx
	movl %ecx, -39(%edx)
L(fwd_write_35bytes):
	movl -35(%eax), %ecx
	movl %ecx, -35(%edx)
L(fwd_write_31bytes):
	movl -31(%eax), %ecx
	movl %ecx, -31(%edx)
L(fwd_write_27bytes):
	movl -27(%eax), %ecx
	movl %ecx, -27(%edx)
L(fwd_write_23bytes):
	movl -23(%eax), %ecx
	movl %ecx, -23(%edx)
L(fwd_write_19bytes):
	movl -19(%eax), %ecx
	movl %ecx, -19(%edx)
L(fwd_write_15bytes):
	movl -15(%eax), %ecx
	movl %ecx, -15(%edx)
L(fwd_write_11bytes):
	movl -11(%eax), %ecx
	movl %ecx, -11(%edx)
L(fwd_write_7bytes):
	movl -7(%eax), %ecx
	movl %ecx, -7(%edx)
L(fwd_write_3bytes):
	/* Word at [-3,-2] plus byte at [-1]; EAX is dead afterwards.  */
	movzwl -3(%eax), %ecx
	movzbl -1(%eax), %eax
	movw %cx, -3(%edx)
	movb %al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	/* mempcpy returns dst + len = EDX.  */
	movl %edx, %eax
#else
	movl DEST(%esp), %eax
#endif
	RETURN_END
1246 | |
	/* Re-establish the unwind state saved before the tail chains
	   above (they unwound it via RETURN/RETURN_END) and remember it
	   again for the rep-copy path below.  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
/* Copy path for very large sizes: unaligned loads plus non-temporal
   (movntdq) stores to avoid polluting the cache.  On entry
   EAX = src, EDX = dst, ECX = bytes remaining.
   NOTE(review): XMM0 and ESI are set up before this chunk —
   presumably the unaligned head bytes and their destination; confirm
   against the dispatch code earlier in the file.  */
L(large_page):
	movdqu (%eax), %xmm1
	movdqu %xmm0, (%esi)	/* flush saved head bytes (see note) */
	movntdq %xmm1, (%edx)
	add $0x10, %eax
	add $0x10, %edx
	sub $0x10, %ecx
	/* If src and dst now agree in their low 8 address bits they have
	   the same alignment, so a rep movsl copy is preferred.  */
	cmp %al, %dl
	je L(copy_page_by_rep)
L(large_page_loop_init):
	POP (%esi)
	sub $0x80, %ecx	/* bias count so the loop can test with jae */
	POP (%edi)
/* Main loop: 128 bytes per iteration; prefetch far ahead of the
   loads, lfence between the load group and the streaming stores.  */
L(large_page_loop):
	prefetchnta 0x1c0(%eax)
	prefetchnta 0x280(%eax)
	movdqu (%eax), %xmm0
	movdqu 0x10(%eax), %xmm1
	movdqu 0x20(%eax), %xmm2
	movdqu 0x30(%eax), %xmm3
	movdqu 0x40(%eax), %xmm4
	movdqu 0x50(%eax), %xmm5
	movdqu 0x60(%eax), %xmm6
	movdqu 0x70(%eax), %xmm7
	lea 0x80(%eax), %eax
	lfence
	sub $0x80, %ecx
	movntdq %xmm0, (%edx)
	movntdq %xmm1, 0x10(%edx)
	movntdq %xmm2, 0x20(%edx)
	movntdq %xmm3, 0x30(%edx)
	movntdq %xmm4, 0x40(%edx)
	movntdq %xmm5, 0x50(%edx)
	movntdq %xmm6, 0x60(%edx)
	movntdq %xmm7, 0x70(%edx)
	lea 0x80(%edx), %edx	/* lea preserves flags from the sub */
	jae L(large_page_loop)
	add $0x80, %ecx	/* undo bias: ECX = bytes left, < 128 */
	cmp $0x40, %ecx
	jb L(large_page_less_64bytes)

	/* One 64-byte chunk.  */
	movdqu (%eax), %xmm0
	movdqu 0x10(%eax), %xmm1
	movdqu 0x20(%eax), %xmm2
	movdqu 0x30(%eax), %xmm3
	lea 0x40(%eax), %eax

	movntdq %xmm0, (%edx)
	movntdq %xmm1, 0x10(%edx)
	movntdq %xmm2, 0x20(%edx)
	movntdq %xmm3, 0x30(%edx)
	lea 0x40(%edx), %edx
	sub $0x40, %ecx
L(large_page_less_64bytes):
	cmp $32, %ecx
	jb L(large_page_less_32bytes)
	/* One 32-byte chunk.  */
	movdqu (%eax), %xmm0
	movdqu 0x10(%eax), %xmm1
	lea 0x20(%eax), %eax
	movntdq %xmm0, (%edx)
	movntdq %xmm1, 0x10(%edx)
	lea 0x20(%edx), %edx
	sub $0x20, %ecx
L(large_page_less_32bytes):
	/* Point EAX/EDX one past the end of the 0..31 remaining bytes
	   and dispatch to the forward tail table.  sfence makes the
	   non-temporal stores globally visible before returning.  */
	add %ecx, %edx
	add %ecx, %eax
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
1318 | |
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
/* Large copy where src and dst share low-byte alignment: use
   rep movsl.  On entry EAX = src, EDX = dst, ECX = byte count;
   ESI/EDI are still saved on the stack from the caller path.  */
L(copy_page_by_rep):
	mov %eax, %esi	/* rep movs takes source in ESI ...  */
	mov %edx, %edi	/* ... and destination in EDI.  */
	mov %ecx, %edx
	shr $2, %ecx	/* ECX = dword count */
	and $3, %edx	/* EDX = 0..3 trailing bytes; sets ZF */
	rep movsl	/* rep movs does not modify flags */
	jz L(copy_page_by_rep_exit)	/* ZF still from the and above */
	cmp $2, %edx
	jb L(copy_page_by_rep_left_1)
	/* 2-byte tail.  */
	movzwl (%esi), %eax
	movw %ax, (%edi)
	add $2, %esi
	add $2, %edi
	sub $2, %edx
	jz L(copy_page_by_rep_exit)
L(copy_page_by_rep_left_1):
	/* Final single byte.  */
	movzbl (%esi), %eax
	movb %al, (%edi)
L(copy_page_by_rep_exit):
	POP (%esi)
	POP (%edi)
	movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	/* mempcpy returns dst + len.  */
	movl LEN(%esp), %ecx
	add %ecx, %eax
#endif
	RETURN
1350 | |
	ALIGN (4)
/* Backward tail copies for lengths that are multiples of 4 (44 down
   to 0).  Unlike the forward chains, EAX = src and EDX = dst point
   at the START of the remainder, so offsets are positive and each
   entry copies the top 4 bytes then falls through.  Reached via
   L(table_48_bytes_bwd).  */
L(bk_write_44bytes):
	movl 40(%eax), %ecx
	movl %ecx, 40(%edx)
L(bk_write_40bytes):
	movl 36(%eax), %ecx
	movl %ecx, 36(%edx)
L(bk_write_36bytes):
	movl 32(%eax), %ecx
	movl %ecx, 32(%edx)
L(bk_write_32bytes):
	movl 28(%eax), %ecx
	movl %ecx, 28(%edx)
L(bk_write_28bytes):
	movl 24(%eax), %ecx
	movl %ecx, 24(%edx)
L(bk_write_24bytes):
	movl 20(%eax), %ecx
	movl %ecx, 20(%edx)
L(bk_write_20bytes):
	movl 16(%eax), %ecx
	movl %ecx, 16(%edx)
L(bk_write_16bytes):
	movl 12(%eax), %ecx
	movl %ecx, 12(%edx)
L(bk_write_12bytes):
	movl 8(%eax), %ecx
	movl %ecx, 8(%edx)
L(bk_write_8bytes):
	movl 4(%eax), %ecx
	movl %ecx, 4(%edx)
L(bk_write_4bytes):
	movl (%eax), %ecx
	movl %ecx, (%edx)
L(bk_write_0bytes):
	movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	/* mempcpy returns dst + len.  */
	movl LEN(%esp), %ecx
	add %ecx, %eax
#endif
	RETURN
1392 | |
	ALIGN (4)
/* Backward tail copies for lengths congruent to 1 mod 4 (45 down
   to 1).  EAX = src, EDX = dst point at the start; each entry copies
   4 bytes at offset <size>-4 and falls through, ending with the
   first byte at offset 0.  */
L(bk_write_45bytes):
	movl 41(%eax), %ecx
	movl %ecx, 41(%edx)
L(bk_write_41bytes):
	movl 37(%eax), %ecx
	movl %ecx, 37(%edx)
L(bk_write_37bytes):
	movl 33(%eax), %ecx
	movl %ecx, 33(%edx)
L(bk_write_33bytes):
	movl 29(%eax), %ecx
	movl %ecx, 29(%edx)
L(bk_write_29bytes):
	movl 25(%eax), %ecx
	movl %ecx, 25(%edx)
L(bk_write_25bytes):
	movl 21(%eax), %ecx
	movl %ecx, 21(%edx)
L(bk_write_21bytes):
	movl 17(%eax), %ecx
	movl %ecx, 17(%edx)
L(bk_write_17bytes):
	movl 13(%eax), %ecx
	movl %ecx, 13(%edx)
L(bk_write_13bytes):
	movl 9(%eax), %ecx
	movl %ecx, 9(%edx)
L(bk_write_9bytes):
	movl 5(%eax), %ecx
	movl %ecx, 5(%edx)
L(bk_write_5bytes):
	movl 1(%eax), %ecx
	movl %ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl (%eax), %ecx
	movb %cl, (%edx)
	movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	/* mempcpy returns dst + len.  */
	movl LEN(%esp), %ecx
	add %ecx, %eax
#endif
	RETURN
1436 | |
	ALIGN (4)
/* Backward tail copies for lengths congruent to 2 mod 4 (46 down
   to 2).  EAX = src, EDX = dst point at the start; each entry copies
   4 bytes at offset <size>-4 and falls through, ending with a
   2-byte word at offset 0.  */
L(bk_write_46bytes):
	movl 42(%eax), %ecx
	movl %ecx, 42(%edx)
L(bk_write_42bytes):
	movl 38(%eax), %ecx
	movl %ecx, 38(%edx)
L(bk_write_38bytes):
	movl 34(%eax), %ecx
	movl %ecx, 34(%edx)
L(bk_write_34bytes):
	movl 30(%eax), %ecx
	movl %ecx, 30(%edx)
L(bk_write_30bytes):
	movl 26(%eax), %ecx
	movl %ecx, 26(%edx)
L(bk_write_26bytes):
	movl 22(%eax), %ecx
	movl %ecx, 22(%edx)
L(bk_write_22bytes):
	movl 18(%eax), %ecx
	movl %ecx, 18(%edx)
L(bk_write_18bytes):
	movl 14(%eax), %ecx
	movl %ecx, 14(%edx)
L(bk_write_14bytes):
	movl 10(%eax), %ecx
	movl %ecx, 10(%edx)
L(bk_write_10bytes):
	movl 6(%eax), %ecx
	movl %ecx, 6(%edx)
L(bk_write_6bytes):
	movl 2(%eax), %ecx
	movl %ecx, 2(%edx)
L(bk_write_2bytes):
	movzwl (%eax), %ecx
	movw %cx, (%edx)
	movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	/* mempcpy returns dst + len.  */
	movl LEN(%esp), %ecx
	add %ecx, %eax
#endif
	RETURN
1480 | |
	ALIGN (4)
/* Backward tail copies for lengths congruent to 3 mod 4 (47 down
   to 3).  EAX = src, EDX = dst point at the start; each entry copies
   4 bytes at offset <size>-4 and falls through, ending with a word
   at offset 1 plus the byte at offset 0.  Ends with RETURN_END
   because this is the last entry of the backward group.  */
L(bk_write_47bytes):
	movl 43(%eax), %ecx
	movl %ecx, 43(%edx)
L(bk_write_43bytes):
	movl 39(%eax), %ecx
	movl %ecx, 39(%edx)
L(bk_write_39bytes):
	movl 35(%eax), %ecx
	movl %ecx, 35(%edx)
L(bk_write_35bytes):
	movl 31(%eax), %ecx
	movl %ecx, 31(%edx)
L(bk_write_31bytes):
	movl 27(%eax), %ecx
	movl %ecx, 27(%edx)
L(bk_write_27bytes):
	movl 23(%eax), %ecx
	movl %ecx, 23(%edx)
L(bk_write_23bytes):
	movl 19(%eax), %ecx
	movl %ecx, 19(%edx)
L(bk_write_19bytes):
	movl 15(%eax), %ecx
	movl %ecx, 15(%edx)
L(bk_write_15bytes):
	movl 11(%eax), %ecx
	movl %ecx, 11(%edx)
L(bk_write_11bytes):
	movl 7(%eax), %ecx
	movl %ecx, 7(%edx)
L(bk_write_7bytes):
	movl 3(%eax), %ecx
	movl %ecx, 3(%edx)
L(bk_write_3bytes):
	/* Word at [1,2] plus the byte at [0]; EAX is dead afterwards.  */
	movzwl 1(%eax), %ecx
	movw %cx, 1(%edx)
	movzbl (%eax), %eax
	movb %al, (%edx)
	movl DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	/* mempcpy returns dst + len.  */
	movl LEN(%esp), %ecx
	add %ecx, %eax
#endif
	RETURN_END
1526 | |
1527 | |
/* Jump tables, placed in a read-only data section.  Entries are
   32-bit offsets relative to the table base (see JMPTBL), indexed by
   residual byte count and consumed by BRANCH_TO_JMPTBL_ENTRY.  */
	.pushsection .rodata.ssse3,"a" ,@progbits
	ALIGN (2)
/* Forward tail dispatch for 0..47 remaining bytes
   (EAX/EDX point one past the end of the remainder).  */
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	ALIGN (2)
/* Dispatch on the relative misalignment (src - dst) mod 16; the
   L(shl_N) bodies are defined earlier in the file.  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	ALIGN (2)
/* Backward tail dispatch for 0..47 remaining bytes
   (EAX/EDX point at the start of the remainder).  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection
1651 | |
#ifdef USE_AS_MEMMOVE
	ALIGN (4)
/* Backward (descending-address) copy for overlapping memmove when
   dst > src.  On entry EAX = src, EDX = dst, ECX = len.  */
L(copy_backward):
	PUSH (%esi)
	movl %eax, %esi
	add %ecx, %edx	/* EDX = dst + len (one past the end) */
	add %ecx, %esi	/* ESI = src + len */
	testl $0x3, %edx
	jnz L(bk_align)

/* The end of the destination (EDX) is 4-byte aligned.  */
L(bk_aligned_4):
	cmp $64, %ecx
	jae L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp $32, %ecx
	jb L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time, highest dword first.  */
	sub $32, %ecx
	movl -4(%esi), %eax
	movl %eax, -4(%edx)
	movl -8(%esi), %eax
	movl %eax, -8(%edx)
	movl -12(%esi), %eax
	movl %eax, -12(%edx)
	movl -16(%esi), %eax
	movl %eax, -16(%edx)
	movl -20(%esi), %eax
	movl %eax, -20(%edx)
	movl -24(%esi), %eax
	movl %eax, -24(%edx)
	movl -28(%esi), %eax
	movl %eax, -28(%edx)
	movl -32(%esi), %eax
	movl %eax, -32(%edx)
	sub $32, %edx
	sub $32, %esi

L(bk_write_less32bytes):
	/* Fewer than 32 bytes remain: rewind EAX/EDX to the start of
	   the remainder and dispatch to the backward tail table.  */
	movl %esi, %eax
	sub %ecx, %edx
	sub %ecx, %eax
	POP (%esi)
L(bk_write_less48bytes):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	CFI_PUSH (%esi)
	ALIGN (4)
/* Align the end of the destination (EDX) down to a 4-byte boundary
   by copying a 1- and/or 2-byte tail first.  */
L(bk_align):
	cmp $8, %ecx
	jbe L(bk_write_less32bytes)
	testl $1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0.  */
	jz L(bk_got2)
	sub $1, %esi
	sub $1, %ecx
	sub $1, %edx
	movzbl (%esi), %eax
	movb %al, (%edx)

	testl $2, %edx
	jz L(bk_aligned_4)

L(bk_got2):
	sub $2, %esi
	sub $2, %ecx
	sub $2, %edx
	movzwl (%esi), %eax
	movw %ax, (%edx)
	jmp L(bk_aligned_4)

	ALIGN (4)
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl $15, %edx
	jz L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes: copy up to three dwords
   (rechecking after each) until EDX reaches a 16-byte boundary.  */
L(bk_ssse3_align):
	sub $4, %esi
	sub $4, %ecx
	sub $4, %edx
	movl (%esi), %eax
	movl %eax, (%edx)

	testl $15, %edx
	jz L(bk_ssse3_cpy_pre)

	sub $4, %esi
	sub $4, %ecx
	sub $4, %edx
	movl (%esi), %eax
	movl %eax, (%edx)

	testl $15, %edx
	jz L(bk_ssse3_cpy_pre)

	sub $4, %esi
	sub $4, %ecx
	sub $4, %edx
	movl (%esi), %eax
	movl %eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp $64, %ecx
	jb L(bk_write_more32bytes)

/* Main backward loop: 64 bytes per iteration, unaligned loads and
   aligned (movdqa) stores — EDX is 16-byte aligned here.  */
L(bk_ssse3_cpy):
	sub $64, %esi
	sub $64, %ecx
	sub $64, %edx
	movdqu 0x30(%esi), %xmm3
	movdqa %xmm3, 0x30(%edx)
	movdqu 0x20(%esi), %xmm2
	movdqa %xmm2, 0x20(%edx)
	movdqu 0x10(%esi), %xmm1
	movdqa %xmm1, 0x10(%edx)
	movdqu (%esi), %xmm0
	movdqa %xmm0, (%edx)
	cmp $64, %ecx
	jae L(bk_ssse3_cpy)
	jmp L(bk_write_64bytesless)

#endif
1779 | |
1780 | END (MEMCPY) |
1781 | |
1782 | #endif |
1783 | |