1 | /* memcpy with SSSE3 |
2 | Copyright (C) 2010-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

# include <sysdep.h>
# include "asm-syntax.h"

/* Default multiarch entry-point names.  MEMCPY may already be defined
   (e.g. by a memmove wrapper that sets USE_AS_MEMMOVE) to build this
   same body under a different symbol.  */
# ifndef MEMCPY
#  define MEMCPY	__memcpy_ssse3
#  define MEMCPY_CHK	__memcpy_chk_ssse3
# endif

/* Byte offsets of the three arguments relative to %esp on entry, per
   the i386 stack-based calling convention: dst, src, len.  PARMS
   accounts for the return address (and saved %ebx when PIC).  */
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4

/* Keep the CFI unwind state in sync with a 4-byte push ...  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* ... and with the matching 4-byte pop.  */
# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

# ifdef PIC
/* PIC builds must preserve the GOT register %ebx, so entry pushes it
   and argument offsets grow to 8 (saved %ebx + return address).
   Jump tables hold offsets relative to the table itself so the text
   stays position-independent.  */
#  define PARMS		8		/* Preserve EBX.  */
#  define ENTRANCE	PUSH (%ebx);
#  define RETURN_END	POP (%ebx); ret
/* RETURN is used mid-function: after the pop/ret, re-assert the CFI
   state for the code that follows, where %ebx is still saved.  */
#  define RETURN	RETURN_END; CFI_PUSH (%ebx)
#  define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register contains the
   index into the jump table.  SCALE is the scale of INDEX.  */

#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
    /* We first load PC into EBX.  */	\
	SETUP_PIC_REG(bx);	\
    /* Get the address of the jump table.  */	\
	addl	$(TABLE - .), %ebx;	\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */	\
	addl	(%ebx, INDEX, SCALE), %ebx;	\
    /* We loaded the jump table.  Go.  */	\
	jmp	*%ebx
# else

/* Non-PIC: %ebx is free, only the return address precedes the
   arguments, and jump tables can hold absolute addresses.  */
#  define PARMS		4
#  define ENTRANCE
#  define RETURN_END	ret
#  define RETURN	RETURN_END
#  define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register contains the index into the
   jump table.  SCALE is the scale of INDEX.  */

#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
	jmp	*TABLE(, INDEX, SCALE)
# endif
83 | |
	.section .text.ssse3,"ax",@progbits
# ifdef SHARED
/* __memcpy_chk_ssse3 (void *dst, const void *src, size_t len,
   size_t dstlen).
   _FORTIFY_SOURCE entry point: 12(%esp) is LEN, 16(%esp) is the
   caller-known destination size.  If the destination is too small
   (dstlen < len, unsigned compare), abort via __chk_fail; otherwise
   fall straight through into MEMCPY, which follows immediately.  */
ENTRY (MEMCPY_CHK)
	movl	12(%esp), %eax
	cmpl	%eax, 16(%esp)
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
# endif
92 | ENTRY (MEMCPY) |
93 | ENTRANCE |
94 | movl LEN(%esp), %ecx |
95 | movl SRC(%esp), %eax |
96 | movl DEST(%esp), %edx |
97 | |
98 | # ifdef USE_AS_MEMMOVE |
99 | cmp %eax, %edx |
100 | jb L(copy_forward) |
101 | je L(fwd_write_0bytes) |
102 | cmp $32, %ecx |
103 | jae L(memmove_bwd) |
104 | jmp L(bk_write_less32bytes_2) |
105 | |
106 | .p2align 4 |
107 | L(memmove_bwd): |
108 | add %ecx, %eax |
109 | cmp %eax, %edx |
110 | movl SRC(%esp), %eax |
111 | jb L(copy_backward) |
112 | |
113 | L(copy_forward): |
114 | # endif |
115 | cmp $48, %ecx |
116 | jae L(48bytesormore) |
117 | |
118 | L(fwd_write_less32bytes): |
119 | # ifndef USE_AS_MEMMOVE |
120 | cmp %dl, %al |
121 | jb L(bk_write) |
122 | # endif |
123 | add %ecx, %edx |
124 | add %ecx, %eax |
125 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) |
126 | # ifndef USE_AS_MEMMOVE |
127 | .p2align 4 |
128 | L(bk_write): |
129 | BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) |
130 | # endif |
131 | |
132 | .p2align 4 |
133 | L(48bytesormore): |
134 | # ifndef USE_AS_MEMMOVE |
135 | movlpd (%eax), %xmm0 |
136 | movlpd 8(%eax), %xmm1 |
137 | movlpd %xmm0, (%edx) |
138 | movlpd %xmm1, 8(%edx) |
139 | # else |
140 | movdqu (%eax), %xmm0 |
141 | # endif |
142 | PUSH (%edi) |
143 | movl %edx, %edi |
144 | and $-16, %edx |
145 | add $16, %edx |
146 | sub %edx, %edi |
147 | add %edi, %ecx |
148 | sub %edi, %eax |
149 | |
150 | # ifdef SHARED_CACHE_SIZE_HALF |
151 | cmp $SHARED_CACHE_SIZE_HALF, %ecx |
152 | # else |
153 | # ifdef PIC |
154 | SETUP_PIC_REG(bx) |
155 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
156 | cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx |
157 | # else |
158 | cmp __x86_shared_cache_size_half, %ecx |
159 | # endif |
160 | # endif |
161 | |
162 | mov %eax, %edi |
163 | jae L(large_page) |
164 | and $0xf, %edi |
165 | jz L(shl_0) |
166 | BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) |
167 | |
168 | .p2align 4 |
169 | L(shl_0): |
170 | # ifdef USE_AS_MEMMOVE |
171 | movl DEST+4(%esp), %edi |
172 | movdqu %xmm0, (%edi) |
173 | # endif |
174 | xor %edi, %edi |
175 | cmp $127, %ecx |
176 | ja L(shl_0_gobble) |
177 | lea -32(%ecx), %ecx |
178 | |
179 | .p2align 4 |
180 | L(shl_0_loop): |
181 | movdqa (%eax, %edi), %xmm0 |
182 | movdqa 16(%eax, %edi), %xmm1 |
183 | sub $32, %ecx |
184 | movdqa %xmm0, (%edx, %edi) |
185 | movdqa %xmm1, 16(%edx, %edi) |
186 | lea 32(%edi), %edi |
187 | jb L(shl_0_end) |
188 | |
189 | movdqa (%eax, %edi), %xmm0 |
190 | movdqa 16(%eax, %edi), %xmm1 |
191 | sub $32, %ecx |
192 | movdqa %xmm0, (%edx, %edi) |
193 | movdqa %xmm1, 16(%edx, %edi) |
194 | lea 32(%edi), %edi |
195 | jb L(shl_0_end) |
196 | |
197 | movdqa (%eax, %edi), %xmm0 |
198 | movdqa 16(%eax, %edi), %xmm1 |
199 | sub $32, %ecx |
200 | movdqa %xmm0, (%edx, %edi) |
201 | movdqa %xmm1, 16(%edx, %edi) |
202 | lea 32(%edi), %edi |
203 | jb L(shl_0_end) |
204 | |
205 | movdqa (%eax, %edi), %xmm0 |
206 | movdqa 16(%eax, %edi), %xmm1 |
207 | sub $32, %ecx |
208 | movdqa %xmm0, (%edx, %edi) |
209 | movdqa %xmm1, 16(%edx, %edi) |
210 | lea 32(%edi), %edi |
211 | |
212 | L(shl_0_end): |
213 | lea 32(%ecx), %ecx |
214 | add %ecx, %edi |
215 | add %edi, %edx |
216 | add %edi, %eax |
217 | POP (%edi) |
218 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) |
219 | |
220 | CFI_PUSH (%edi) |
221 | |
222 | .p2align 4 |
223 | L(shl_0_gobble): |
224 | # ifdef DATA_CACHE_SIZE_HALF |
225 | cmp $DATA_CACHE_SIZE_HALF, %ecx |
226 | # else |
227 | # ifdef PIC |
228 | SETUP_PIC_REG(bx) |
229 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
230 | cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
231 | # else |
232 | cmp __x86_data_cache_size_half, %ecx |
233 | # endif |
234 | # endif |
235 | POP (%edi) |
236 | lea -128(%ecx), %ecx |
237 | jae L(shl_0_gobble_mem_loop) |
238 | |
239 | .p2align 4 |
240 | L(shl_0_gobble_cache_loop): |
241 | movdqa (%eax), %xmm0 |
242 | movdqa 0x10(%eax), %xmm1 |
243 | movdqa 0x20(%eax), %xmm2 |
244 | movdqa 0x30(%eax), %xmm3 |
245 | movdqa 0x40(%eax), %xmm4 |
246 | movdqa 0x50(%eax), %xmm5 |
247 | movdqa 0x60(%eax), %xmm6 |
248 | movdqa 0x70(%eax), %xmm7 |
249 | lea 0x80(%eax), %eax |
250 | sub $128, %ecx |
251 | movdqa %xmm0, (%edx) |
252 | movdqa %xmm1, 0x10(%edx) |
253 | movdqa %xmm2, 0x20(%edx) |
254 | movdqa %xmm3, 0x30(%edx) |
255 | movdqa %xmm4, 0x40(%edx) |
256 | movdqa %xmm5, 0x50(%edx) |
257 | movdqa %xmm6, 0x60(%edx) |
258 | movdqa %xmm7, 0x70(%edx) |
259 | lea 0x80(%edx), %edx |
260 | |
261 | jae L(shl_0_gobble_cache_loop) |
262 | cmp $-0x40, %ecx |
263 | lea 0x80(%ecx), %ecx |
264 | jl L(shl_0_cache_less_64bytes) |
265 | |
266 | movdqa (%eax), %xmm0 |
267 | sub $0x40, %ecx |
268 | movdqa 0x10(%eax), %xmm1 |
269 | movdqa %xmm0, (%edx) |
270 | movdqa %xmm1, 0x10(%edx) |
271 | movdqa 0x20(%eax), %xmm0 |
272 | movdqa 0x30(%eax), %xmm1 |
273 | add $0x40, %eax |
274 | movdqa %xmm0, 0x20(%edx) |
275 | movdqa %xmm1, 0x30(%edx) |
276 | add $0x40, %edx |
277 | |
278 | L(shl_0_cache_less_64bytes): |
279 | cmp $0x20, %ecx |
280 | jb L(shl_0_cache_less_32bytes) |
281 | movdqa (%eax), %xmm0 |
282 | sub $0x20, %ecx |
283 | movdqa 0x10(%eax), %xmm1 |
284 | add $0x20, %eax |
285 | movdqa %xmm0, (%edx) |
286 | movdqa %xmm1, 0x10(%edx) |
287 | add $0x20, %edx |
288 | |
289 | L(shl_0_cache_less_32bytes): |
290 | cmp $0x10, %ecx |
291 | jb L(shl_0_cache_less_16bytes) |
292 | sub $0x10, %ecx |
293 | movdqa (%eax), %xmm0 |
294 | add $0x10, %eax |
295 | movdqa %xmm0, (%edx) |
296 | add $0x10, %edx |
297 | |
298 | L(shl_0_cache_less_16bytes): |
299 | add %ecx, %edx |
300 | add %ecx, %eax |
301 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) |
302 | |
303 | .p2align 4 |
304 | L(shl_0_gobble_mem_loop): |
305 | prefetcht0 0x1c0(%eax) |
306 | prefetcht0 0x280(%eax) |
307 | prefetcht0 0x1c0(%edx) |
308 | |
309 | movdqa (%eax), %xmm0 |
310 | movdqa 0x10(%eax), %xmm1 |
311 | movdqa 0x20(%eax), %xmm2 |
312 | movdqa 0x30(%eax), %xmm3 |
313 | movdqa 0x40(%eax), %xmm4 |
314 | movdqa 0x50(%eax), %xmm5 |
315 | movdqa 0x60(%eax), %xmm6 |
316 | movdqa 0x70(%eax), %xmm7 |
317 | lea 0x80(%eax), %eax |
318 | sub $0x80, %ecx |
319 | movdqa %xmm0, (%edx) |
320 | movdqa %xmm1, 0x10(%edx) |
321 | movdqa %xmm2, 0x20(%edx) |
322 | movdqa %xmm3, 0x30(%edx) |
323 | movdqa %xmm4, 0x40(%edx) |
324 | movdqa %xmm5, 0x50(%edx) |
325 | movdqa %xmm6, 0x60(%edx) |
326 | movdqa %xmm7, 0x70(%edx) |
327 | lea 0x80(%edx), %edx |
328 | |
329 | jae L(shl_0_gobble_mem_loop) |
330 | cmp $-0x40, %ecx |
331 | lea 0x80(%ecx), %ecx |
332 | jl L(shl_0_mem_less_64bytes) |
333 | |
334 | movdqa (%eax), %xmm0 |
335 | sub $0x40, %ecx |
336 | movdqa 0x10(%eax), %xmm1 |
337 | |
338 | movdqa %xmm0, (%edx) |
339 | movdqa %xmm1, 0x10(%edx) |
340 | |
341 | movdqa 0x20(%eax), %xmm0 |
342 | movdqa 0x30(%eax), %xmm1 |
343 | add $0x40, %eax |
344 | |
345 | movdqa %xmm0, 0x20(%edx) |
346 | movdqa %xmm1, 0x30(%edx) |
347 | add $0x40, %edx |
348 | |
349 | L(shl_0_mem_less_64bytes): |
350 | cmp $0x20, %ecx |
351 | jb L(shl_0_mem_less_32bytes) |
352 | movdqa (%eax), %xmm0 |
353 | sub $0x20, %ecx |
354 | movdqa 0x10(%eax), %xmm1 |
355 | add $0x20, %eax |
356 | movdqa %xmm0, (%edx) |
357 | movdqa %xmm1, 0x10(%edx) |
358 | add $0x20, %edx |
359 | |
360 | L(shl_0_mem_less_32bytes): |
361 | cmp $0x10, %ecx |
362 | jb L(shl_0_mem_less_16bytes) |
363 | sub $0x10, %ecx |
364 | movdqa (%eax), %xmm0 |
365 | add $0x10, %eax |
366 | movdqa %xmm0, (%edx) |
367 | add $0x10, %edx |
368 | |
369 | L(shl_0_mem_less_16bytes): |
370 | add %ecx, %edx |
371 | add %ecx, %eax |
372 | BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) |
373 | |
374 | .p2align 4 |
375 | L(shl_1): |
376 | # ifndef USE_AS_MEMMOVE |
377 | movaps -1(%eax), %xmm1 |
378 | # else |
379 | movl DEST+4(%esp), %edi |
380 | movaps -1(%eax), %xmm1 |
381 | movdqu %xmm0, (%edi) |
382 | # endif |
383 | # ifdef DATA_CACHE_SIZE_HALF |
384 | cmp $DATA_CACHE_SIZE_HALF, %ecx |
385 | # else |
386 | # ifdef PIC |
387 | SETUP_PIC_REG(bx) |
388 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
389 | cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
390 | # else |
391 | cmp __x86_data_cache_size_half, %ecx |
392 | # endif |
393 | # endif |
394 | jb L(sh_1_no_prefetch) |
395 | |
396 | lea -64(%ecx), %ecx |
397 | |
398 | .p2align 4 |
399 | L(Shl1LoopStart): |
400 | prefetcht0 0x1c0(%eax) |
401 | prefetcht0 0x1c0(%edx) |
402 | movaps 15(%eax), %xmm2 |
403 | movaps 31(%eax), %xmm3 |
404 | movaps 47(%eax), %xmm4 |
405 | movaps 63(%eax), %xmm5 |
406 | movaps %xmm5, %xmm7 |
407 | palignr $1, %xmm4, %xmm5 |
408 | palignr $1, %xmm3, %xmm4 |
409 | movaps %xmm5, 48(%edx) |
410 | palignr $1, %xmm2, %xmm3 |
411 | lea 64(%eax), %eax |
412 | palignr $1, %xmm1, %xmm2 |
413 | movaps %xmm4, 32(%edx) |
414 | movaps %xmm3, 16(%edx) |
415 | movaps %xmm7, %xmm1 |
416 | movaps %xmm2, (%edx) |
417 | lea 64(%edx), %edx |
418 | sub $64, %ecx |
419 | ja L(Shl1LoopStart) |
420 | |
421 | L(Shl1LoopLeave): |
422 | add $32, %ecx |
423 | jle L(shl_end_0) |
424 | |
425 | movaps 15(%eax), %xmm2 |
426 | movaps 31(%eax), %xmm3 |
427 | palignr $1, %xmm2, %xmm3 |
428 | palignr $1, %xmm1, %xmm2 |
429 | movaps %xmm2, (%edx) |
430 | movaps %xmm3, 16(%edx) |
431 | lea 32(%edx, %ecx), %edx |
432 | lea 32(%eax, %ecx), %eax |
433 | POP (%edi) |
434 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
435 | |
436 | CFI_PUSH (%edi) |
437 | |
438 | .p2align 4 |
439 | L(sh_1_no_prefetch): |
440 | lea -32(%ecx), %ecx |
441 | lea -1(%eax), %eax |
442 | xor %edi, %edi |
443 | |
444 | .p2align 4 |
445 | L(sh_1_no_prefetch_loop): |
446 | movdqa 16(%eax, %edi), %xmm2 |
447 | sub $32, %ecx |
448 | movdqa 32(%eax, %edi), %xmm3 |
449 | movdqa %xmm3, %xmm4 |
450 | palignr $1, %xmm2, %xmm3 |
451 | palignr $1, %xmm1, %xmm2 |
452 | lea 32(%edi), %edi |
453 | movdqa %xmm2, -32(%edx, %edi) |
454 | movdqa %xmm3, -16(%edx, %edi) |
455 | jb L(sh_1_end_no_prefetch_loop) |
456 | |
457 | movdqa 16(%eax, %edi), %xmm2 |
458 | sub $32, %ecx |
459 | movdqa 32(%eax, %edi), %xmm3 |
460 | movdqa %xmm3, %xmm1 |
461 | palignr $1, %xmm2, %xmm3 |
462 | palignr $1, %xmm4, %xmm2 |
463 | lea 32(%edi), %edi |
464 | movdqa %xmm2, -32(%edx, %edi) |
465 | movdqa %xmm3, -16(%edx, %edi) |
466 | jae L(sh_1_no_prefetch_loop) |
467 | |
468 | L(sh_1_end_no_prefetch_loop): |
469 | lea 32(%ecx), %ecx |
470 | add %ecx, %edi |
471 | add %edi, %edx |
472 | lea 1(%edi, %eax), %eax |
473 | POP (%edi) |
474 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
475 | |
476 | CFI_PUSH (%edi) |
477 | |
478 | .p2align 4 |
479 | L(shl_2): |
480 | # ifndef USE_AS_MEMMOVE |
481 | movaps -2(%eax), %xmm1 |
482 | # else |
483 | movl DEST+4(%esp), %edi |
484 | movaps -2(%eax), %xmm1 |
485 | movdqu %xmm0, (%edi) |
486 | # endif |
487 | # ifdef DATA_CACHE_SIZE_HALF |
488 | cmp $DATA_CACHE_SIZE_HALF, %ecx |
489 | # else |
490 | # ifdef PIC |
491 | SETUP_PIC_REG(bx) |
492 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
493 | cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
494 | # else |
495 | cmp __x86_data_cache_size_half, %ecx |
496 | # endif |
497 | # endif |
498 | jb L(sh_2_no_prefetch) |
499 | |
500 | lea -64(%ecx), %ecx |
501 | |
502 | .p2align 4 |
503 | L(Shl2LoopStart): |
504 | prefetcht0 0x1c0(%eax) |
505 | prefetcht0 0x1c0(%edx) |
506 | movaps 14(%eax), %xmm2 |
507 | movaps 30(%eax), %xmm3 |
508 | movaps 46(%eax), %xmm4 |
509 | movaps 62(%eax), %xmm5 |
510 | movaps %xmm5, %xmm7 |
511 | palignr $2, %xmm4, %xmm5 |
512 | palignr $2, %xmm3, %xmm4 |
513 | movaps %xmm5, 48(%edx) |
514 | palignr $2, %xmm2, %xmm3 |
515 | lea 64(%eax), %eax |
516 | palignr $2, %xmm1, %xmm2 |
517 | movaps %xmm4, 32(%edx) |
518 | movaps %xmm3, 16(%edx) |
519 | movaps %xmm7, %xmm1 |
520 | movaps %xmm2, (%edx) |
521 | lea 64(%edx), %edx |
522 | sub $64, %ecx |
523 | ja L(Shl2LoopStart) |
524 | |
525 | L(Shl2LoopLeave): |
526 | add $32, %ecx |
527 | jle L(shl_end_0) |
528 | |
529 | movaps 14(%eax), %xmm2 |
530 | movaps 30(%eax), %xmm3 |
531 | palignr $2, %xmm2, %xmm3 |
532 | palignr $2, %xmm1, %xmm2 |
533 | movaps %xmm2, (%edx) |
534 | movaps %xmm3, 16(%edx) |
535 | lea 32(%edx, %ecx), %edx |
536 | lea 32(%eax, %ecx), %eax |
537 | POP (%edi) |
538 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
539 | |
540 | CFI_PUSH (%edi) |
541 | |
542 | .p2align 4 |
543 | L(sh_2_no_prefetch): |
544 | lea -32(%ecx), %ecx |
545 | lea -2(%eax), %eax |
546 | xor %edi, %edi |
547 | |
548 | .p2align 4 |
549 | L(sh_2_no_prefetch_loop): |
550 | movdqa 16(%eax, %edi), %xmm2 |
551 | sub $32, %ecx |
552 | movdqa 32(%eax, %edi), %xmm3 |
553 | movdqa %xmm3, %xmm4 |
554 | palignr $2, %xmm2, %xmm3 |
555 | palignr $2, %xmm1, %xmm2 |
556 | lea 32(%edi), %edi |
557 | movdqa %xmm2, -32(%edx, %edi) |
558 | movdqa %xmm3, -16(%edx, %edi) |
559 | jb L(sh_2_end_no_prefetch_loop) |
560 | |
561 | movdqa 16(%eax, %edi), %xmm2 |
562 | sub $32, %ecx |
563 | movdqa 32(%eax, %edi), %xmm3 |
564 | movdqa %xmm3, %xmm1 |
565 | palignr $2, %xmm2, %xmm3 |
566 | palignr $2, %xmm4, %xmm2 |
567 | lea 32(%edi), %edi |
568 | movdqa %xmm2, -32(%edx, %edi) |
569 | movdqa %xmm3, -16(%edx, %edi) |
570 | jae L(sh_2_no_prefetch_loop) |
571 | |
572 | L(sh_2_end_no_prefetch_loop): |
573 | lea 32(%ecx), %ecx |
574 | add %ecx, %edi |
575 | add %edi, %edx |
576 | lea 2(%edi, %eax), %eax |
577 | POP (%edi) |
578 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
579 | |
580 | CFI_PUSH (%edi) |
581 | |
582 | .p2align 4 |
583 | L(shl_3): |
584 | # ifndef USE_AS_MEMMOVE |
585 | movaps -3(%eax), %xmm1 |
586 | # else |
587 | movl DEST+4(%esp), %edi |
588 | movaps -3(%eax), %xmm1 |
589 | movdqu %xmm0, (%edi) |
590 | # endif |
591 | # ifdef DATA_CACHE_SIZE_HALF |
592 | cmp $DATA_CACHE_SIZE_HALF, %ecx |
593 | # else |
594 | # ifdef PIC |
595 | SETUP_PIC_REG(bx) |
596 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
597 | cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
598 | # else |
599 | cmp __x86_data_cache_size_half, %ecx |
600 | # endif |
601 | # endif |
602 | jb L(sh_3_no_prefetch) |
603 | |
604 | lea -64(%ecx), %ecx |
605 | |
606 | .p2align 4 |
607 | L(Shl3LoopStart): |
608 | prefetcht0 0x1c0(%eax) |
609 | prefetcht0 0x1c0(%edx) |
610 | movaps 13(%eax), %xmm2 |
611 | movaps 29(%eax), %xmm3 |
612 | movaps 45(%eax), %xmm4 |
613 | movaps 61(%eax), %xmm5 |
614 | movaps %xmm5, %xmm7 |
615 | palignr $3, %xmm4, %xmm5 |
616 | palignr $3, %xmm3, %xmm4 |
617 | movaps %xmm5, 48(%edx) |
618 | palignr $3, %xmm2, %xmm3 |
619 | lea 64(%eax), %eax |
620 | palignr $3, %xmm1, %xmm2 |
621 | movaps %xmm4, 32(%edx) |
622 | movaps %xmm3, 16(%edx) |
623 | movaps %xmm7, %xmm1 |
624 | movaps %xmm2, (%edx) |
625 | lea 64(%edx), %edx |
626 | sub $64, %ecx |
627 | ja L(Shl3LoopStart) |
628 | |
629 | L(Shl3LoopLeave): |
630 | add $32, %ecx |
631 | jle L(shl_end_0) |
632 | |
633 | movaps 13(%eax), %xmm2 |
634 | movaps 29(%eax), %xmm3 |
635 | palignr $3, %xmm2, %xmm3 |
636 | palignr $3, %xmm1, %xmm2 |
637 | movaps %xmm2, (%edx) |
638 | movaps %xmm3, 16(%edx) |
639 | lea 32(%edx, %ecx), %edx |
640 | lea 32(%eax, %ecx), %eax |
641 | POP (%edi) |
642 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
643 | |
644 | CFI_PUSH (%edi) |
645 | |
646 | .p2align 4 |
647 | L(sh_3_no_prefetch): |
648 | lea -32(%ecx), %ecx |
649 | lea -3(%eax), %eax |
650 | xor %edi, %edi |
651 | |
652 | .p2align 4 |
653 | L(sh_3_no_prefetch_loop): |
654 | movdqa 16(%eax, %edi), %xmm2 |
655 | sub $32, %ecx |
656 | movdqa 32(%eax, %edi), %xmm3 |
657 | movdqa %xmm3, %xmm4 |
658 | palignr $3, %xmm2, %xmm3 |
659 | palignr $3, %xmm1, %xmm2 |
660 | lea 32(%edi), %edi |
661 | movdqa %xmm2, -32(%edx, %edi) |
662 | movdqa %xmm3, -16(%edx, %edi) |
663 | |
664 | jb L(sh_3_end_no_prefetch_loop) |
665 | |
666 | movdqa 16(%eax, %edi), %xmm2 |
667 | sub $32, %ecx |
668 | movdqa 32(%eax, %edi), %xmm3 |
669 | movdqa %xmm3, %xmm1 |
670 | palignr $3, %xmm2, %xmm3 |
671 | palignr $3, %xmm4, %xmm2 |
672 | lea 32(%edi), %edi |
673 | movdqa %xmm2, -32(%edx, %edi) |
674 | movdqa %xmm3, -16(%edx, %edi) |
675 | |
676 | jae L(sh_3_no_prefetch_loop) |
677 | |
678 | L(sh_3_end_no_prefetch_loop): |
679 | lea 32(%ecx), %ecx |
680 | add %ecx, %edi |
681 | add %edi, %edx |
682 | lea 3(%edi, %eax), %eax |
683 | POP (%edi) |
684 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
685 | |
686 | CFI_PUSH (%edi) |
687 | |
688 | .p2align 4 |
689 | L(shl_4): |
690 | # ifndef USE_AS_MEMMOVE |
691 | movaps -4(%eax), %xmm1 |
692 | # else |
693 | movl DEST+4(%esp), %edi |
694 | movaps -4(%eax), %xmm1 |
695 | movdqu %xmm0, (%edi) |
696 | # endif |
697 | # ifdef DATA_CACHE_SIZE_HALF |
698 | cmp $DATA_CACHE_SIZE_HALF, %ecx |
699 | # else |
700 | # ifdef PIC |
701 | SETUP_PIC_REG(bx) |
702 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
703 | cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
704 | # else |
705 | cmp __x86_data_cache_size_half, %ecx |
706 | # endif |
707 | # endif |
708 | jb L(sh_4_no_prefetch) |
709 | |
710 | lea -64(%ecx), %ecx |
711 | |
712 | .p2align 4 |
713 | L(Shl4LoopStart): |
714 | prefetcht0 0x1c0(%eax) |
715 | prefetcht0 0x1c0(%edx) |
716 | movaps 12(%eax), %xmm2 |
717 | movaps 28(%eax), %xmm3 |
718 | movaps 44(%eax), %xmm4 |
719 | movaps 60(%eax), %xmm5 |
720 | movaps %xmm5, %xmm7 |
721 | palignr $4, %xmm4, %xmm5 |
722 | palignr $4, %xmm3, %xmm4 |
723 | movaps %xmm5, 48(%edx) |
724 | palignr $4, %xmm2, %xmm3 |
725 | lea 64(%eax), %eax |
726 | palignr $4, %xmm1, %xmm2 |
727 | movaps %xmm4, 32(%edx) |
728 | movaps %xmm3, 16(%edx) |
729 | movaps %xmm7, %xmm1 |
730 | movaps %xmm2, (%edx) |
731 | lea 64(%edx), %edx |
732 | sub $64, %ecx |
733 | ja L(Shl4LoopStart) |
734 | |
735 | L(Shl4LoopLeave): |
736 | add $32, %ecx |
737 | jle L(shl_end_0) |
738 | |
739 | movaps 12(%eax), %xmm2 |
740 | movaps 28(%eax), %xmm3 |
741 | palignr $4, %xmm2, %xmm3 |
742 | palignr $4, %xmm1, %xmm2 |
743 | movaps %xmm2, (%edx) |
744 | movaps %xmm3, 16(%edx) |
745 | lea 32(%edx, %ecx), %edx |
746 | lea 32(%eax, %ecx), %eax |
747 | POP (%edi) |
748 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
749 | |
750 | CFI_PUSH (%edi) |
751 | |
752 | .p2align 4 |
753 | L(sh_4_no_prefetch): |
754 | lea -32(%ecx), %ecx |
755 | lea -4(%eax), %eax |
756 | xor %edi, %edi |
757 | |
758 | .p2align 4 |
759 | L(sh_4_no_prefetch_loop): |
760 | movdqa 16(%eax, %edi), %xmm2 |
761 | sub $32, %ecx |
762 | movdqa 32(%eax, %edi), %xmm3 |
763 | movdqa %xmm3, %xmm4 |
764 | palignr $4, %xmm2, %xmm3 |
765 | palignr $4, %xmm1, %xmm2 |
766 | lea 32(%edi), %edi |
767 | movdqa %xmm2, -32(%edx, %edi) |
768 | movdqa %xmm3, -16(%edx, %edi) |
769 | |
770 | jb L(sh_4_end_no_prefetch_loop) |
771 | |
772 | movdqa 16(%eax, %edi), %xmm2 |
773 | sub $32, %ecx |
774 | movdqa 32(%eax, %edi), %xmm3 |
775 | movdqa %xmm3, %xmm1 |
776 | palignr $4, %xmm2, %xmm3 |
777 | palignr $4, %xmm4, %xmm2 |
778 | lea 32(%edi), %edi |
779 | movdqa %xmm2, -32(%edx, %edi) |
780 | movdqa %xmm3, -16(%edx, %edi) |
781 | |
782 | jae L(sh_4_no_prefetch_loop) |
783 | |
784 | L(sh_4_end_no_prefetch_loop): |
785 | lea 32(%ecx), %ecx |
786 | add %ecx, %edi |
787 | add %edi, %edx |
788 | lea 4(%edi, %eax), %eax |
789 | POP (%edi) |
790 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
791 | |
792 | CFI_PUSH (%edi) |
793 | |
794 | .p2align 4 |
795 | L(shl_5): |
796 | # ifndef USE_AS_MEMMOVE |
797 | movaps -5(%eax), %xmm1 |
798 | # else |
799 | movl DEST+4(%esp), %edi |
800 | movaps -5(%eax), %xmm1 |
801 | movdqu %xmm0, (%edi) |
802 | # endif |
803 | # ifdef DATA_CACHE_SIZE_HALF |
804 | cmp $DATA_CACHE_SIZE_HALF, %ecx |
805 | # else |
806 | # ifdef PIC |
807 | SETUP_PIC_REG(bx) |
808 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
809 | cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
810 | # else |
811 | cmp __x86_data_cache_size_half, %ecx |
812 | # endif |
813 | # endif |
814 | jb L(sh_5_no_prefetch) |
815 | |
816 | lea -64(%ecx), %ecx |
817 | |
818 | .p2align 4 |
819 | L(Shl5LoopStart): |
820 | prefetcht0 0x1c0(%eax) |
821 | prefetcht0 0x1c0(%edx) |
822 | movaps 11(%eax), %xmm2 |
823 | movaps 27(%eax), %xmm3 |
824 | movaps 43(%eax), %xmm4 |
825 | movaps 59(%eax), %xmm5 |
826 | movaps %xmm5, %xmm7 |
827 | palignr $5, %xmm4, %xmm5 |
828 | palignr $5, %xmm3, %xmm4 |
829 | movaps %xmm5, 48(%edx) |
830 | palignr $5, %xmm2, %xmm3 |
831 | lea 64(%eax), %eax |
832 | palignr $5, %xmm1, %xmm2 |
833 | movaps %xmm4, 32(%edx) |
834 | movaps %xmm3, 16(%edx) |
835 | movaps %xmm7, %xmm1 |
836 | movaps %xmm2, (%edx) |
837 | lea 64(%edx), %edx |
838 | sub $64, %ecx |
839 | ja L(Shl5LoopStart) |
840 | |
841 | L(Shl5LoopLeave): |
842 | add $32, %ecx |
843 | jle L(shl_end_0) |
844 | |
845 | movaps 11(%eax), %xmm2 |
846 | movaps 27(%eax), %xmm3 |
847 | palignr $5, %xmm2, %xmm3 |
848 | palignr $5, %xmm1, %xmm2 |
849 | movaps %xmm2, (%edx) |
850 | movaps %xmm3, 16(%edx) |
851 | lea 32(%edx, %ecx), %edx |
852 | lea 32(%eax, %ecx), %eax |
853 | POP (%edi) |
854 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
855 | |
856 | CFI_PUSH (%edi) |
857 | |
858 | .p2align 4 |
859 | L(sh_5_no_prefetch): |
860 | lea -32(%ecx), %ecx |
861 | lea -5(%eax), %eax |
862 | xor %edi, %edi |
863 | |
864 | .p2align 4 |
865 | L(sh_5_no_prefetch_loop): |
866 | movdqa 16(%eax, %edi), %xmm2 |
867 | sub $32, %ecx |
868 | movdqa 32(%eax, %edi), %xmm3 |
869 | movdqa %xmm3, %xmm4 |
870 | palignr $5, %xmm2, %xmm3 |
871 | palignr $5, %xmm1, %xmm2 |
872 | lea 32(%edi), %edi |
873 | movdqa %xmm2, -32(%edx, %edi) |
874 | movdqa %xmm3, -16(%edx, %edi) |
875 | |
876 | jb L(sh_5_end_no_prefetch_loop) |
877 | |
878 | movdqa 16(%eax, %edi), %xmm2 |
879 | sub $32, %ecx |
880 | movdqa 32(%eax, %edi), %xmm3 |
881 | movdqa %xmm3, %xmm1 |
882 | palignr $5, %xmm2, %xmm3 |
883 | palignr $5, %xmm4, %xmm2 |
884 | lea 32(%edi), %edi |
885 | movdqa %xmm2, -32(%edx, %edi) |
886 | movdqa %xmm3, -16(%edx, %edi) |
887 | |
888 | jae L(sh_5_no_prefetch_loop) |
889 | |
890 | L(sh_5_end_no_prefetch_loop): |
891 | lea 32(%ecx), %ecx |
892 | add %ecx, %edi |
893 | add %edi, %edx |
894 | lea 5(%edi, %eax), %eax |
895 | POP (%edi) |
896 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
897 | |
898 | CFI_PUSH (%edi) |
899 | |
900 | .p2align 4 |
901 | L(shl_6): |
902 | # ifndef USE_AS_MEMMOVE |
903 | movaps -6(%eax), %xmm1 |
904 | # else |
905 | movl DEST+4(%esp), %edi |
906 | movaps -6(%eax), %xmm1 |
907 | movdqu %xmm0, (%edi) |
908 | # endif |
909 | # ifdef DATA_CACHE_SIZE_HALF |
910 | cmp $DATA_CACHE_SIZE_HALF, %ecx |
911 | # else |
912 | # ifdef PIC |
913 | SETUP_PIC_REG(bx) |
914 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
915 | cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
916 | # else |
917 | cmp __x86_data_cache_size_half, %ecx |
918 | # endif |
919 | # endif |
920 | jb L(sh_6_no_prefetch) |
921 | |
922 | lea -64(%ecx), %ecx |
923 | |
924 | .p2align 4 |
925 | L(Shl6LoopStart): |
926 | prefetcht0 0x1c0(%eax) |
927 | prefetcht0 0x1c0(%edx) |
928 | movaps 10(%eax), %xmm2 |
929 | movaps 26(%eax), %xmm3 |
930 | movaps 42(%eax), %xmm4 |
931 | movaps 58(%eax), %xmm5 |
932 | movaps %xmm5, %xmm7 |
933 | palignr $6, %xmm4, %xmm5 |
934 | palignr $6, %xmm3, %xmm4 |
935 | movaps %xmm5, 48(%edx) |
936 | palignr $6, %xmm2, %xmm3 |
937 | lea 64(%eax), %eax |
938 | palignr $6, %xmm1, %xmm2 |
939 | movaps %xmm4, 32(%edx) |
940 | movaps %xmm3, 16(%edx) |
941 | movaps %xmm7, %xmm1 |
942 | movaps %xmm2, (%edx) |
943 | lea 64(%edx), %edx |
944 | sub $64, %ecx |
945 | ja L(Shl6LoopStart) |
946 | |
947 | L(Shl6LoopLeave): |
948 | add $32, %ecx |
949 | jle L(shl_end_0) |
950 | |
951 | movaps 10(%eax), %xmm2 |
952 | movaps 26(%eax), %xmm3 |
953 | palignr $6, %xmm2, %xmm3 |
954 | palignr $6, %xmm1, %xmm2 |
955 | movaps %xmm2, (%edx) |
956 | movaps %xmm3, 16(%edx) |
957 | lea 32(%edx, %ecx), %edx |
958 | lea 32(%eax, %ecx), %eax |
959 | POP (%edi) |
960 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
961 | |
962 | CFI_PUSH (%edi) |
963 | |
964 | .p2align 4 |
965 | L(sh_6_no_prefetch): |
966 | lea -32(%ecx), %ecx |
967 | lea -6(%eax), %eax |
968 | xor %edi, %edi |
969 | |
970 | .p2align 4 |
971 | L(sh_6_no_prefetch_loop): |
972 | movdqa 16(%eax, %edi), %xmm2 |
973 | sub $32, %ecx |
974 | movdqa 32(%eax, %edi), %xmm3 |
975 | movdqa %xmm3, %xmm4 |
976 | palignr $6, %xmm2, %xmm3 |
977 | palignr $6, %xmm1, %xmm2 |
978 | lea 32(%edi), %edi |
979 | movdqa %xmm2, -32(%edx, %edi) |
980 | movdqa %xmm3, -16(%edx, %edi) |
981 | |
982 | jb L(sh_6_end_no_prefetch_loop) |
983 | |
984 | movdqa 16(%eax, %edi), %xmm2 |
985 | sub $32, %ecx |
986 | movdqa 32(%eax, %edi), %xmm3 |
987 | movdqa %xmm3, %xmm1 |
988 | palignr $6, %xmm2, %xmm3 |
989 | palignr $6, %xmm4, %xmm2 |
990 | lea 32(%edi), %edi |
991 | movdqa %xmm2, -32(%edx, %edi) |
992 | movdqa %xmm3, -16(%edx, %edi) |
993 | |
994 | jae L(sh_6_no_prefetch_loop) |
995 | |
996 | L(sh_6_end_no_prefetch_loop): |
997 | lea 32(%ecx), %ecx |
998 | add %ecx, %edi |
999 | add %edi, %edx |
1000 | lea 6(%edi, %eax), %eax |
1001 | POP (%edi) |
1002 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
1003 | |
1004 | CFI_PUSH (%edi) |
1005 | |
1006 | .p2align 4 |
1007 | L(shl_7): |
1008 | # ifndef USE_AS_MEMMOVE |
1009 | movaps -7(%eax), %xmm1 |
1010 | # else |
1011 | movl DEST+4(%esp), %edi |
1012 | movaps -7(%eax), %xmm1 |
1013 | movdqu %xmm0, (%edi) |
1014 | # endif |
1015 | # ifdef DATA_CACHE_SIZE_HALF |
1016 | cmp $DATA_CACHE_SIZE_HALF, %ecx |
1017 | # else |
1018 | # ifdef PIC |
1019 | SETUP_PIC_REG(bx) |
1020 | add $_GLOBAL_OFFSET_TABLE_, %ebx |
1021 | cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx |
1022 | # else |
1023 | cmp __x86_data_cache_size_half, %ecx |
1024 | # endif |
1025 | # endif |
1026 | jb L(sh_7_no_prefetch) |
1027 | |
1028 | lea -64(%ecx), %ecx |
1029 | |
1030 | .p2align 4 |
1031 | L(Shl7LoopStart): |
1032 | prefetcht0 0x1c0(%eax) |
1033 | prefetcht0 0x1c0(%edx) |
1034 | movaps 9(%eax), %xmm2 |
1035 | movaps 25(%eax), %xmm3 |
1036 | movaps 41(%eax), %xmm4 |
1037 | movaps 57(%eax), %xmm5 |
1038 | movaps %xmm5, %xmm7 |
1039 | palignr $7, %xmm4, %xmm5 |
1040 | palignr $7, %xmm3, %xmm4 |
1041 | movaps %xmm5, 48(%edx) |
1042 | palignr $7, %xmm2, %xmm3 |
1043 | lea 64(%eax), %eax |
1044 | palignr $7, %xmm1, %xmm2 |
1045 | movaps %xmm4, 32(%edx) |
1046 | movaps %xmm3, 16(%edx) |
1047 | movaps %xmm7, %xmm1 |
1048 | movaps %xmm2, (%edx) |
1049 | lea 64(%edx), %edx |
1050 | sub $64, %ecx |
1051 | ja L(Shl7LoopStart) |
1052 | |
1053 | L(Shl7LoopLeave): |
1054 | add $32, %ecx |
1055 | jle L(shl_end_0) |
1056 | |
1057 | movaps 9(%eax), %xmm2 |
1058 | movaps 25(%eax), %xmm3 |
1059 | palignr $7, %xmm2, %xmm3 |
1060 | palignr $7, %xmm1, %xmm2 |
1061 | movaps %xmm2, (%edx) |
1062 | movaps %xmm3, 16(%edx) |
1063 | lea 32(%edx, %ecx), %edx |
1064 | lea 32(%eax, %ecx), %eax |
1065 | POP (%edi) |
1066 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
1067 | |
1068 | CFI_PUSH (%edi) |
1069 | |
1070 | .p2align 4 |
1071 | L(sh_7_no_prefetch): |
1072 | lea -32(%ecx), %ecx |
1073 | lea -7(%eax), %eax |
1074 | xor %edi, %edi |
1075 | |
1076 | .p2align 4 |
1077 | L(sh_7_no_prefetch_loop): |
1078 | movdqa 16(%eax, %edi), %xmm2 |
1079 | sub $32, %ecx |
1080 | movdqa 32(%eax, %edi), %xmm3 |
1081 | movdqa %xmm3, %xmm4 |
1082 | palignr $7, %xmm2, %xmm3 |
1083 | palignr $7, %xmm1, %xmm2 |
1084 | lea 32(%edi), %edi |
1085 | movdqa %xmm2, -32(%edx, %edi) |
1086 | movdqa %xmm3, -16(%edx, %edi) |
1087 | jb L(sh_7_end_no_prefetch_loop) |
1088 | |
1089 | movdqa 16(%eax, %edi), %xmm2 |
1090 | sub $32, %ecx |
1091 | movdqa 32(%eax, %edi), %xmm3 |
1092 | movdqa %xmm3, %xmm1 |
1093 | palignr $7, %xmm2, %xmm3 |
1094 | palignr $7, %xmm4, %xmm2 |
1095 | lea 32(%edi), %edi |
1096 | movdqa %xmm2, -32(%edx, %edi) |
1097 | movdqa %xmm3, -16(%edx, %edi) |
1098 | jae L(sh_7_no_prefetch_loop) |
1099 | |
1100 | L(sh_7_end_no_prefetch_loop): |
1101 | lea 32(%ecx), %ecx |
1102 | add %ecx, %edi |
1103 | add %edi, %edx |
1104 | lea 7(%edi, %eax), %eax |
1105 | POP (%edi) |
1106 | BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) |
1107 | |
1108 | CFI_PUSH (%edi) |
1109 | |
 .p2align 4
/* Forward copy where the source is 8 bytes past 16-byte alignment.
   xmm1 carries the previously loaded aligned chunk across iterations;
   PALIGNR merges adjacent chunks so all loads/stores are aligned.
   NOTE(review): entry register state is established before this view.  */
L(shl_8):
# ifndef USE_AS_MEMMOVE
 movaps -8(%eax), %xmm1 /* Prime carry with chunk preceding src.  */
# else
 movl DEST+4(%esp), %edi /* memmove: reload dst ...  */
 movaps -8(%eax), %xmm1
 movdqu %xmm0, (%edi) /* ... and store head bytes saved in xmm0.  */
# endif
/* Choose software-prefetch loop only for copies of at least half the
   data cache size; smaller copies use the plain loop below.  */
# ifdef DATA_CACHE_SIZE_HALF
 cmp $DATA_CACHE_SIZE_HALF, %ecx
# else
# ifdef PIC
 SETUP_PIC_REG(bx)
 add $_GLOBAL_OFFSET_TABLE_, %ebx
 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
 cmp __x86_data_cache_size_half, %ecx
# endif
# endif
 jb L(sh_8_no_prefetch)

 lea -64(%ecx), %ecx /* Bias count for the SUB-driven loop exit.  */

 .p2align 4
/* Prefetching loop: 64 bytes per iteration, four chunks realigned.  */
L(Shl8LoopStart):
 prefetcht0 0x1c0(%eax) /* Prefetch 448 bytes ahead on both streams.  */
 prefetcht0 0x1c0(%edx)
 movaps 8(%eax), %xmm2
 movaps 24(%eax), %xmm3
 movaps 40(%eax), %xmm4
 movaps 56(%eax), %xmm5
 movaps %xmm5, %xmm7 /* Preserve last chunk for next iteration.  */
 palignr $8, %xmm4, %xmm5
 palignr $8, %xmm3, %xmm4
 movaps %xmm5, 48(%edx)
 palignr $8, %xmm2, %xmm3
 lea 64(%eax), %eax
 palignr $8, %xmm1, %xmm2
 movaps %xmm4, 32(%edx)
 movaps %xmm3, 16(%edx)
 movaps %xmm7, %xmm1 /* New carry chunk.  */
 movaps %xmm2, (%edx)
 lea 64(%edx), %edx
 sub $64, %ecx
 ja L(Shl8LoopStart)

L(LoopLeave8):
 add $32, %ecx /* >32 (biased) bytes left? copy one more pair.  */
 jle L(shl_end_0)

 movaps 8(%eax), %xmm2
 movaps 24(%eax), %xmm3
 palignr $8, %xmm2, %xmm3
 palignr $8, %xmm1, %xmm2
 movaps %xmm2, (%edx)
 movaps %xmm3, 16(%edx)
 lea 32(%edx, %ecx), %edx /* Point past the copied region + tail.  */
 lea 32(%eax, %ecx), %eax
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi) /* Restore CFI state for next entry point.  */

 .p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice; carry
   chunk alternates between xmm1 and xmm4.  */
L(sh_8_no_prefetch):
 lea -32(%ecx), %ecx /* Bias count by -32 for flag-driven exits.  */
 lea -8(%eax), %eax /* Bias src down to its aligned base.  */
 xor %edi, %edi /* %edi = running offset.  */

 .p2align 4
L(sh_8_no_prefetch_loop):
 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm4
 palignr $8, %xmm2, %xmm3
 palignr $8, %xmm1, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jb L(sh_8_end_no_prefetch_loop)

 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm1
 palignr $8, %xmm2, %xmm3
 palignr $8, %xmm4, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jae L(sh_8_no_prefetch_loop)

L(sh_8_end_no_prefetch_loop):
 lea 32(%ecx), %ecx /* Undo bias: tail byte count.  */
 add %ecx, %edi
 add %edi, %edx
 lea 8(%edi, %eax), %eax /* Undo -8 source bias.  */
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)
1213 | |
 .p2align 4
/* Forward copy where the source is 9 bytes past 16-byte alignment.
   Same scheme as the other shl_N entries: xmm1 carries the previous
   aligned chunk, PALIGNR $9 realigns pairs of chunks.  */
L(shl_9):
# ifndef USE_AS_MEMMOVE
 movaps -9(%eax), %xmm1 /* Prime carry with chunk preceding src.  */
# else
 movl DEST+4(%esp), %edi /* memmove: reload dst ...  */
 movaps -9(%eax), %xmm1
 movdqu %xmm0, (%edi) /* ... store head bytes saved in xmm0.  */
# endif
/* Prefetch loop only for copies >= half the data cache size.  */
# ifdef DATA_CACHE_SIZE_HALF
 cmp $DATA_CACHE_SIZE_HALF, %ecx
# else
# ifdef PIC
 SETUP_PIC_REG(bx)
 add $_GLOBAL_OFFSET_TABLE_, %ebx
 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
 cmp __x86_data_cache_size_half, %ecx
# endif
# endif
 jb L(sh_9_no_prefetch)

 lea -64(%ecx), %ecx /* Bias count for SUB-driven loop exit.  */

 .p2align 4
/* Prefetching loop: 64 bytes per iteration.  */
L(Shl9LoopStart):
 prefetcht0 0x1c0(%eax)
 prefetcht0 0x1c0(%edx)
 movaps 7(%eax), %xmm2 /* 7 = 16 - 9: aligned chunk addresses.  */
 movaps 23(%eax), %xmm3
 movaps 39(%eax), %xmm4
 movaps 55(%eax), %xmm5
 movaps %xmm5, %xmm7 /* Preserve last chunk for next iteration.  */
 palignr $9, %xmm4, %xmm5
 palignr $9, %xmm3, %xmm4
 movaps %xmm5, 48(%edx)
 palignr $9, %xmm2, %xmm3
 lea 64(%eax), %eax
 palignr $9, %xmm1, %xmm2
 movaps %xmm4, 32(%edx)
 movaps %xmm3, 16(%edx)
 movaps %xmm7, %xmm1 /* New carry chunk.  */
 movaps %xmm2, (%edx)
 lea 64(%edx), %edx
 sub $64, %ecx
 ja L(Shl9LoopStart)

L(Shl9LoopLeave):
 add $32, %ecx /* >32 (biased) bytes left? copy one more pair.  */
 jle L(shl_end_0)

 movaps 7(%eax), %xmm2
 movaps 23(%eax), %xmm3
 palignr $9, %xmm2, %xmm3
 palignr $9, %xmm1, %xmm2

 movaps %xmm2, (%edx)
 movaps %xmm3, 16(%edx)
 lea 32(%edx, %ecx), %edx /* Point past copied region + tail.  */
 lea 32(%eax, %ecx), %eax
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)

 .p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice.  */
L(sh_9_no_prefetch):
 lea -32(%ecx), %ecx /* Bias count by -32.  */
 lea -9(%eax), %eax /* Bias src down to aligned base.  */
 xor %edi, %edi /* Running offset.  */

 .p2align 4
L(sh_9_no_prefetch_loop):
 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm4
 palignr $9, %xmm2, %xmm3
 palignr $9, %xmm1, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jb L(sh_9_end_no_prefetch_loop)

 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm1
 palignr $9, %xmm2, %xmm3
 palignr $9, %xmm4, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jae L(sh_9_no_prefetch_loop)

L(sh_9_end_no_prefetch_loop):
 lea 32(%ecx), %ecx /* Undo bias: tail byte count.  */
 add %ecx, %edi
 add %edi, %edx
 lea 9(%edi, %eax), %eax /* Undo -9 source bias.  */
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)
1318 | |
 .p2align 4
/* Forward copy where the source is 10 bytes past 16-byte alignment.
   xmm1 carries the previous aligned chunk; PALIGNR $10 realigns.  */
L(shl_10):
# ifndef USE_AS_MEMMOVE
 movaps -10(%eax), %xmm1 /* Prime carry with chunk preceding src.  */
# else
 movl DEST+4(%esp), %edi /* memmove: reload dst ...  */
 movaps -10(%eax), %xmm1
 movdqu %xmm0, (%edi) /* ... store head bytes saved in xmm0.  */
# endif
/* Prefetch loop only for copies >= half the data cache size.  */
# ifdef DATA_CACHE_SIZE_HALF
 cmp $DATA_CACHE_SIZE_HALF, %ecx
# else
# ifdef PIC
 SETUP_PIC_REG(bx)
 add $_GLOBAL_OFFSET_TABLE_, %ebx
 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
 cmp __x86_data_cache_size_half, %ecx
# endif
# endif
 jb L(sh_10_no_prefetch)

 lea -64(%ecx), %ecx /* Bias count for SUB-driven loop exit.  */

 .p2align 4
/* Prefetching loop: 64 bytes per iteration.  */
L(Shl10LoopStart):
 prefetcht0 0x1c0(%eax)
 prefetcht0 0x1c0(%edx)
 movaps 6(%eax), %xmm2 /* 6 = 16 - 10: aligned chunk addresses.  */
 movaps 22(%eax), %xmm3
 movaps 38(%eax), %xmm4
 movaps 54(%eax), %xmm5
 movaps %xmm5, %xmm7 /* Preserve last chunk for next iteration.  */
 palignr $10, %xmm4, %xmm5
 palignr $10, %xmm3, %xmm4
 movaps %xmm5, 48(%edx)
 palignr $10, %xmm2, %xmm3
 lea 64(%eax), %eax
 palignr $10, %xmm1, %xmm2
 movaps %xmm4, 32(%edx)
 movaps %xmm3, 16(%edx)
 movaps %xmm7, %xmm1 /* New carry chunk.  */
 movaps %xmm2, (%edx)
 lea 64(%edx), %edx
 sub $64, %ecx
 ja L(Shl10LoopStart)

L(Shl10LoopLeave):
 add $32, %ecx /* >32 (biased) bytes left? copy one more pair.  */
 jle L(shl_end_0)

 movaps 6(%eax), %xmm2
 movaps 22(%eax), %xmm3
 palignr $10, %xmm2, %xmm3
 palignr $10, %xmm1, %xmm2

 movaps %xmm2, (%edx)
 movaps %xmm3, 16(%edx)
 lea 32(%edx, %ecx), %edx /* Point past copied region + tail.  */
 lea 32(%eax, %ecx), %eax
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)

 .p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice.  */
L(sh_10_no_prefetch):
 lea -32(%ecx), %ecx /* Bias count by -32.  */
 lea -10(%eax), %eax /* Bias src down to aligned base.  */
 xor %edi, %edi /* Running offset.  */

 .p2align 4
L(sh_10_no_prefetch_loop):
 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm4
 palignr $10, %xmm2, %xmm3
 palignr $10, %xmm1, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jb L(sh_10_end_no_prefetch_loop)

 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm1
 palignr $10, %xmm2, %xmm3
 palignr $10, %xmm4, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jae L(sh_10_no_prefetch_loop)

L(sh_10_end_no_prefetch_loop):
 lea 32(%ecx), %ecx /* Undo bias: tail byte count.  */
 add %ecx, %edi
 add %edi, %edx
 lea 10(%edi, %eax), %eax /* Undo -10 source bias.  */
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)
1423 | |
 .p2align 4
/* Forward copy where the source is 11 bytes past 16-byte alignment.
   xmm1 carries the previous aligned chunk; PALIGNR $11 realigns.  */
L(shl_11):
# ifndef USE_AS_MEMMOVE
 movaps -11(%eax), %xmm1 /* Prime carry with chunk preceding src.  */
# else
 movl DEST+4(%esp), %edi /* memmove: reload dst ...  */
 movaps -11(%eax), %xmm1
 movdqu %xmm0, (%edi) /* ... store head bytes saved in xmm0.  */
# endif
/* Prefetch loop only for copies >= half the data cache size.  */
# ifdef DATA_CACHE_SIZE_HALF
 cmp $DATA_CACHE_SIZE_HALF, %ecx
# else
# ifdef PIC
 SETUP_PIC_REG(bx)
 add $_GLOBAL_OFFSET_TABLE_, %ebx
 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
 cmp __x86_data_cache_size_half, %ecx
# endif
# endif
 jb L(sh_11_no_prefetch)

 lea -64(%ecx), %ecx /* Bias count for SUB-driven loop exit.  */

 .p2align 4
/* Prefetching loop: 64 bytes per iteration.  */
L(Shl11LoopStart):
 prefetcht0 0x1c0(%eax)
 prefetcht0 0x1c0(%edx)
 movaps 5(%eax), %xmm2 /* 5 = 16 - 11: aligned chunk addresses.  */
 movaps 21(%eax), %xmm3
 movaps 37(%eax), %xmm4
 movaps 53(%eax), %xmm5
 movaps %xmm5, %xmm7 /* Preserve last chunk for next iteration.  */
 palignr $11, %xmm4, %xmm5
 palignr $11, %xmm3, %xmm4
 movaps %xmm5, 48(%edx)
 palignr $11, %xmm2, %xmm3
 lea 64(%eax), %eax
 palignr $11, %xmm1, %xmm2
 movaps %xmm4, 32(%edx)
 movaps %xmm3, 16(%edx)
 movaps %xmm7, %xmm1 /* New carry chunk.  */
 movaps %xmm2, (%edx)
 lea 64(%edx), %edx
 sub $64, %ecx
 ja L(Shl11LoopStart)

L(Shl11LoopLeave):
 add $32, %ecx /* >32 (biased) bytes left? copy one more pair.  */
 jle L(shl_end_0)

 movaps 5(%eax), %xmm2
 movaps 21(%eax), %xmm3
 palignr $11, %xmm2, %xmm3
 palignr $11, %xmm1, %xmm2

 movaps %xmm2, (%edx)
 movaps %xmm3, 16(%edx)
 lea 32(%edx, %ecx), %edx /* Point past copied region + tail.  */
 lea 32(%eax, %ecx), %eax
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)

 .p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice.  */
L(sh_11_no_prefetch):
 lea -32(%ecx), %ecx /* Bias count by -32.  */
 lea -11(%eax), %eax /* Bias src down to aligned base.  */
 xor %edi, %edi /* Running offset.  */

 .p2align 4
L(sh_11_no_prefetch_loop):
 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm4
 palignr $11, %xmm2, %xmm3
 palignr $11, %xmm1, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jb L(sh_11_end_no_prefetch_loop)

 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm1
 palignr $11, %xmm2, %xmm3
 palignr $11, %xmm4, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jae L(sh_11_no_prefetch_loop)

L(sh_11_end_no_prefetch_loop):
 lea 32(%ecx), %ecx /* Undo bias: tail byte count.  */
 add %ecx, %edi
 add %edi, %edx
 lea 11(%edi, %eax), %eax /* Undo -11 source bias.  */
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)
1528 | |
 .p2align 4
/* Forward copy where the source is 12 bytes past 16-byte alignment.
   xmm1 carries the previous aligned chunk; PALIGNR $12 realigns.  */
L(shl_12):
# ifndef USE_AS_MEMMOVE
 movaps -12(%eax), %xmm1 /* Prime carry with chunk preceding src.  */
# else
 movl DEST+4(%esp), %edi /* memmove: reload dst ...  */
 movaps -12(%eax), %xmm1
 movdqu %xmm0, (%edi) /* ... store head bytes saved in xmm0.  */
# endif
/* Prefetch loop only for copies >= half the data cache size.  */
# ifdef DATA_CACHE_SIZE_HALF
 cmp $DATA_CACHE_SIZE_HALF, %ecx
# else
# ifdef PIC
 SETUP_PIC_REG(bx)
 add $_GLOBAL_OFFSET_TABLE_, %ebx
 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
 cmp __x86_data_cache_size_half, %ecx
# endif
# endif
 jb L(sh_12_no_prefetch)

 lea -64(%ecx), %ecx /* Bias count for SUB-driven loop exit.  */

 .p2align 4
/* Prefetching loop: 64 bytes per iteration.  */
L(Shl12LoopStart):
 prefetcht0 0x1c0(%eax)
 prefetcht0 0x1c0(%edx)
 movaps 4(%eax), %xmm2 /* 4 = 16 - 12: aligned chunk addresses.  */
 movaps 20(%eax), %xmm3
 movaps 36(%eax), %xmm4
 movaps 52(%eax), %xmm5
 movaps %xmm5, %xmm7 /* Preserve last chunk for next iteration.  */
 palignr $12, %xmm4, %xmm5
 palignr $12, %xmm3, %xmm4
 movaps %xmm5, 48(%edx)
 palignr $12, %xmm2, %xmm3
 lea 64(%eax), %eax
 palignr $12, %xmm1, %xmm2
 movaps %xmm4, 32(%edx)
 movaps %xmm3, 16(%edx)
 movaps %xmm7, %xmm1 /* New carry chunk.  */
 movaps %xmm2, (%edx)
 lea 64(%edx), %edx
 sub $64, %ecx
 ja L(Shl12LoopStart)

L(Shl12LoopLeave):
 add $32, %ecx /* >32 (biased) bytes left? copy one more pair.  */
 jle L(shl_end_0)

 movaps 4(%eax), %xmm2
 movaps 20(%eax), %xmm3
 palignr $12, %xmm2, %xmm3
 palignr $12, %xmm1, %xmm2

 movaps %xmm2, (%edx)
 movaps %xmm3, 16(%edx)
 lea 32(%edx, %ecx), %edx /* Point past copied region + tail.  */
 lea 32(%eax, %ecx), %eax
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)

 .p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice.  */
L(sh_12_no_prefetch):
 lea -32(%ecx), %ecx /* Bias count by -32.  */
 lea -12(%eax), %eax /* Bias src down to aligned base.  */
 xor %edi, %edi /* Running offset.  */

 .p2align 4
L(sh_12_no_prefetch_loop):
 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm4
 palignr $12, %xmm2, %xmm3
 palignr $12, %xmm1, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jb L(sh_12_end_no_prefetch_loop)

 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm1
 palignr $12, %xmm2, %xmm3
 palignr $12, %xmm4, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jae L(sh_12_no_prefetch_loop)

L(sh_12_end_no_prefetch_loop):
 lea 32(%ecx), %ecx /* Undo bias: tail byte count.  */
 add %ecx, %edi
 add %edi, %edx
 lea 12(%edi, %eax), %eax /* Undo -12 source bias.  */
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)
1633 | |
 .p2align 4
/* Forward copy where the source is 13 bytes past 16-byte alignment.
   xmm1 carries the previous aligned chunk; PALIGNR $13 realigns.  */
L(shl_13):
# ifndef USE_AS_MEMMOVE
 movaps -13(%eax), %xmm1 /* Prime carry with chunk preceding src.  */
# else
 movl DEST+4(%esp), %edi /* memmove: reload dst ...  */
 movaps -13(%eax), %xmm1
 movdqu %xmm0, (%edi) /* ... store head bytes saved in xmm0.  */
# endif
/* Prefetch loop only for copies >= half the data cache size.  */
# ifdef DATA_CACHE_SIZE_HALF
 cmp $DATA_CACHE_SIZE_HALF, %ecx
# else
# ifdef PIC
 SETUP_PIC_REG(bx)
 add $_GLOBAL_OFFSET_TABLE_, %ebx
 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
 cmp __x86_data_cache_size_half, %ecx
# endif
# endif
 jb L(sh_13_no_prefetch)

 lea -64(%ecx), %ecx /* Bias count for SUB-driven loop exit.  */

 .p2align 4
/* Prefetching loop: 64 bytes per iteration.  */
L(Shl13LoopStart):
 prefetcht0 0x1c0(%eax)
 prefetcht0 0x1c0(%edx)
 movaps 3(%eax), %xmm2 /* 3 = 16 - 13: aligned chunk addresses.  */
 movaps 19(%eax), %xmm3
 movaps 35(%eax), %xmm4
 movaps 51(%eax), %xmm5
 movaps %xmm5, %xmm7 /* Preserve last chunk for next iteration.  */
 palignr $13, %xmm4, %xmm5
 palignr $13, %xmm3, %xmm4
 movaps %xmm5, 48(%edx)
 palignr $13, %xmm2, %xmm3
 lea 64(%eax), %eax
 palignr $13, %xmm1, %xmm2
 movaps %xmm4, 32(%edx)
 movaps %xmm3, 16(%edx)
 movaps %xmm7, %xmm1 /* New carry chunk.  */
 movaps %xmm2, (%edx)
 lea 64(%edx), %edx
 sub $64, %ecx
 ja L(Shl13LoopStart)

L(Shl13LoopLeave):
 add $32, %ecx /* >32 (biased) bytes left? copy one more pair.  */
 jle L(shl_end_0)

 movaps 3(%eax), %xmm2
 movaps 19(%eax), %xmm3
 palignr $13, %xmm2, %xmm3
 palignr $13, %xmm1, %xmm2

 movaps %xmm2, (%edx)
 movaps %xmm3, 16(%edx)
 lea 32(%edx, %ecx), %edx /* Point past copied region + tail.  */
 lea 32(%eax, %ecx), %eax
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)

 .p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice.  */
L(sh_13_no_prefetch):
 lea -32(%ecx), %ecx /* Bias count by -32.  */
 lea -13(%eax), %eax /* Bias src down to aligned base.  */
 xor %edi, %edi /* Running offset.  */

 .p2align 4
L(sh_13_no_prefetch_loop):
 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm4
 palignr $13, %xmm2, %xmm3
 palignr $13, %xmm1, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jb L(sh_13_end_no_prefetch_loop)

 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm1
 palignr $13, %xmm2, %xmm3
 palignr $13, %xmm4, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jae L(sh_13_no_prefetch_loop)

L(sh_13_end_no_prefetch_loop):
 lea 32(%ecx), %ecx /* Undo bias: tail byte count.  */
 add %ecx, %edi
 add %edi, %edx
 lea 13(%edi, %eax), %eax /* Undo -13 source bias.  */
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)
1738 | |
 .p2align 4
/* Forward copy where the source is 14 bytes past 16-byte alignment.
   xmm1 carries the previous aligned chunk; PALIGNR $14 realigns.  */
L(shl_14):
# ifndef USE_AS_MEMMOVE
 movaps -14(%eax), %xmm1 /* Prime carry with chunk preceding src.  */
# else
 movl DEST+4(%esp), %edi /* memmove: reload dst ...  */
 movaps -14(%eax), %xmm1
 movdqu %xmm0, (%edi) /* ... store head bytes saved in xmm0.  */
# endif
/* Prefetch loop only for copies >= half the data cache size.  */
# ifdef DATA_CACHE_SIZE_HALF
 cmp $DATA_CACHE_SIZE_HALF, %ecx
# else
# ifdef PIC
 SETUP_PIC_REG(bx)
 add $_GLOBAL_OFFSET_TABLE_, %ebx
 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
 cmp __x86_data_cache_size_half, %ecx
# endif
# endif
 jb L(sh_14_no_prefetch)

 lea -64(%ecx), %ecx /* Bias count for SUB-driven loop exit.  */

 .p2align 4
/* Prefetching loop: 64 bytes per iteration.  */
L(Shl14LoopStart):
 prefetcht0 0x1c0(%eax)
 prefetcht0 0x1c0(%edx)
 movaps 2(%eax), %xmm2 /* 2 = 16 - 14: aligned chunk addresses.  */
 movaps 18(%eax), %xmm3
 movaps 34(%eax), %xmm4
 movaps 50(%eax), %xmm5
 movaps %xmm5, %xmm7 /* Preserve last chunk for next iteration.  */
 palignr $14, %xmm4, %xmm5
 palignr $14, %xmm3, %xmm4
 movaps %xmm5, 48(%edx)
 palignr $14, %xmm2, %xmm3
 lea 64(%eax), %eax
 palignr $14, %xmm1, %xmm2
 movaps %xmm4, 32(%edx)
 movaps %xmm3, 16(%edx)
 movaps %xmm7, %xmm1 /* New carry chunk.  */
 movaps %xmm2, (%edx)
 lea 64(%edx), %edx
 sub $64, %ecx
 ja L(Shl14LoopStart)

L(Shl14LoopLeave):
 add $32, %ecx /* >32 (biased) bytes left? copy one more pair.  */
 jle L(shl_end_0)

 movaps 2(%eax), %xmm2
 movaps 18(%eax), %xmm3
 palignr $14, %xmm2, %xmm3
 palignr $14, %xmm1, %xmm2

 movaps %xmm2, (%edx)
 movaps %xmm3, 16(%edx)
 lea 32(%edx, %ecx), %edx /* Point past copied region + tail.  */
 lea 32(%eax, %ecx), %eax
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)

 .p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice.  */
L(sh_14_no_prefetch):
 lea -32(%ecx), %ecx /* Bias count by -32.  */
 lea -14(%eax), %eax /* Bias src down to aligned base.  */
 xor %edi, %edi /* Running offset.  */

 .p2align 4
L(sh_14_no_prefetch_loop):
 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm4
 palignr $14, %xmm2, %xmm3
 palignr $14, %xmm1, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jb L(sh_14_end_no_prefetch_loop)

 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm1
 palignr $14, %xmm2, %xmm3
 palignr $14, %xmm4, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jae L(sh_14_no_prefetch_loop)

L(sh_14_end_no_prefetch_loop):
 lea 32(%ecx), %ecx /* Undo bias: tail byte count.  */
 add %ecx, %edi
 add %edi, %edx
 lea 14(%edi, %eax), %eax /* Undo -14 source bias.  */
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)
1843 | |
 .p2align 4
/* Forward copy where the source is 15 bytes past 16-byte alignment.
   xmm1 carries the previous aligned chunk; PALIGNR $15 realigns.  */
L(shl_15):
# ifndef USE_AS_MEMMOVE
 movaps -15(%eax), %xmm1 /* Prime carry with chunk preceding src.  */
# else
 movl DEST+4(%esp), %edi /* memmove: reload dst ...  */
 movaps -15(%eax), %xmm1
 movdqu %xmm0, (%edi) /* ... store head bytes saved in xmm0.  */
# endif
/* Prefetch loop only for copies >= half the data cache size.  */
# ifdef DATA_CACHE_SIZE_HALF
 cmp $DATA_CACHE_SIZE_HALF, %ecx
# else
# ifdef PIC
 SETUP_PIC_REG(bx)
 add $_GLOBAL_OFFSET_TABLE_, %ebx
 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
 cmp __x86_data_cache_size_half, %ecx
# endif
# endif
 jb L(sh_15_no_prefetch)

 lea -64(%ecx), %ecx /* Bias count for SUB-driven loop exit.  */

 .p2align 4
/* Prefetching loop: 64 bytes per iteration.  */
L(Shl15LoopStart):
 prefetcht0 0x1c0(%eax)
 prefetcht0 0x1c0(%edx)
 movaps 1(%eax), %xmm2 /* 1 = 16 - 15: aligned chunk addresses.  */
 movaps 17(%eax), %xmm3
 movaps 33(%eax), %xmm4
 movaps 49(%eax), %xmm5
 movaps %xmm5, %xmm7 /* Preserve last chunk for next iteration.  */
 palignr $15, %xmm4, %xmm5
 palignr $15, %xmm3, %xmm4
 movaps %xmm5, 48(%edx)
 palignr $15, %xmm2, %xmm3
 lea 64(%eax), %eax
 palignr $15, %xmm1, %xmm2
 movaps %xmm4, 32(%edx)
 movaps %xmm3, 16(%edx)
 movaps %xmm7, %xmm1 /* New carry chunk.  */
 movaps %xmm2, (%edx)
 lea 64(%edx), %edx
 sub $64, %ecx
 ja L(Shl15LoopStart)

L(Shl15LoopLeave):
 add $32, %ecx /* >32 (biased) bytes left? copy one more pair.  */
 jle L(shl_end_0)

 movaps 1(%eax), %xmm2
 movaps 17(%eax), %xmm3
 palignr $15, %xmm2, %xmm3
 palignr $15, %xmm1, %xmm2

 movaps %xmm2, (%edx)
 movaps %xmm3, 16(%edx)
 lea 32(%edx, %ecx), %edx /* Point past copied region + tail.  */
 lea 32(%eax, %ecx), %eax
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)

 .p2align 4
/* No-prefetch variant: 32 bytes/iteration, unrolled twice.  */
L(sh_15_no_prefetch):
 lea -32(%ecx), %ecx /* Bias count by -32.  */
 lea -15(%eax), %eax /* Bias src down to aligned base.  */
 xor %edi, %edi /* Running offset.  */

 .p2align 4
L(sh_15_no_prefetch_loop):
 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm4
 palignr $15, %xmm2, %xmm3
 palignr $15, %xmm1, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jb L(sh_15_end_no_prefetch_loop)

 movdqa 16(%eax, %edi), %xmm2
 sub $32, %ecx
 movdqa 32(%eax, %edi), %xmm3
 movdqa %xmm3, %xmm1
 palignr $15, %xmm2, %xmm3
 palignr $15, %xmm4, %xmm2
 lea 32(%edi), %edi
 movdqa %xmm2, -32(%edx, %edi)
 movdqa %xmm3, -16(%edx, %edi)
 jae L(sh_15_no_prefetch_loop)

L(sh_15_end_no_prefetch_loop):
 lea 32(%ecx), %ecx /* Undo bias: tail byte count.  */
 add %ecx, %edi
 add %edi, %edx
 lea 15(%edi, %eax), %eax /* Undo -15 source bias.  */
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

 CFI_PUSH (%edi)
1948 | |
 .p2align 4
/* Common exit for all ShlN prefetch loops when at most 32 (biased)
   bytes remain: un-bias the count, advance src/dst past the region
   already copied, and dispatch the tail via the forward jump table.  */
L(shl_end_0):
 lea 32(%ecx), %ecx /* Undo -32 bias: %ecx = tail byte count.  */
 lea (%edx, %ecx), %edx /* %edx = one past end of destination.  */
 lea (%eax, %ecx), %eax /* %eax = one past end of source.  */
 POP (%edi)
 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1956 | |
 .p2align 4
/* Jump-table tails: copy exactly N bytes ending at %edx from data
   ending at %eax (both registers point one past the end), by falling
   through successive 8-byte MOVQ copies.  This chain handles the
   N % 8 == 4 sizes; the final 4 bytes go through %ecx.  */
L(fwd_write_44bytes):
 movq -44(%eax), %xmm0
 movq %xmm0, -44(%edx)
L(fwd_write_36bytes):
 movq -36(%eax), %xmm0
 movq %xmm0, -36(%edx)
L(fwd_write_28bytes):
 movq -28(%eax), %xmm0
 movq %xmm0, -28(%edx)
L(fwd_write_20bytes):
 movq -20(%eax), %xmm0
 movq %xmm0, -20(%edx)
L(fwd_write_12bytes):
 movq -12(%eax), %xmm0
 movq %xmm0, -12(%edx)
L(fwd_write_4bytes):
 movl -4(%eax), %ecx /* Last 4 bytes via an integer register.  */
 movl %ecx, -4(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax /* mempcpy returns dst + n.  */
# else
 movl DEST(%esp), %eax /* memcpy/memmove return original dst.  */
# endif
 RETURN

 .p2align 4
/* Tail chain for N % 8 == 0 sizes: pure 8-byte MOVQ copies.  */
L(fwd_write_40bytes):
 movq -40(%eax), %xmm0
 movq %xmm0, -40(%edx)
L(fwd_write_32bytes):
 movq -32(%eax), %xmm0
 movq %xmm0, -32(%edx)
L(fwd_write_24bytes):
 movq -24(%eax), %xmm0
 movq %xmm0, -24(%edx)
L(fwd_write_16bytes):
 movq -16(%eax), %xmm0
 movq %xmm0, -16(%edx)
L(fwd_write_8bytes):
 movq -8(%eax), %xmm0
 movq %xmm0, -8(%edx)
L(fwd_write_0bytes):
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax /* mempcpy returns dst + n.  */
# else
 movl DEST(%esp), %eax /* memcpy/memmove return original dst.  */
# endif
 RETURN
2006 | |
 .p2align 4
/* Copy exactly 5 bytes ending at %edx: two overlapping 4-byte moves.  */
L(fwd_write_5bytes):
 movl -5(%eax), %ecx
 movl -4(%eax), %eax /* Overlaps previous store by 3 bytes.  */
 movl %ecx, -5(%edx)
 movl %eax, -4(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax /* mempcpy returns dst + n.  */
# else
 movl DEST(%esp), %eax /* memcpy/memmove return original dst.  */
# endif
 RETURN

 .p2align 4
/* Tail chain for N % 8 == 5 sizes: MOVQ copies down to 13 bytes,
   then 4 bytes + 1 byte for the remainder.  */
L(fwd_write_45bytes):
 movq -45(%eax), %xmm0
 movq %xmm0, -45(%edx)
L(fwd_write_37bytes):
 movq -37(%eax), %xmm0
 movq %xmm0, -37(%edx)
L(fwd_write_29bytes):
 movq -29(%eax), %xmm0
 movq %xmm0, -29(%edx)
L(fwd_write_21bytes):
 movq -21(%eax), %xmm0
 movq %xmm0, -21(%edx)
L(fwd_write_13bytes):
 movq -13(%eax), %xmm0
 movq %xmm0, -13(%edx)
 movl -5(%eax), %ecx /* Remaining 5 bytes: dword + byte.  */
 movl %ecx, -5(%edx)
 movzbl -1(%eax), %ecx
 movb %cl, -1(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax
# else
 movl DEST(%esp), %eax
# endif
 RETURN

 .p2align 4
/* Tail chain for N % 8 == 1 sizes: MOVQ copies, then one byte.  */
L(fwd_write_41bytes):
 movq -41(%eax), %xmm0
 movq %xmm0, -41(%edx)
L(fwd_write_33bytes):
 movq -33(%eax), %xmm0
 movq %xmm0, -33(%edx)
L(fwd_write_25bytes):
 movq -25(%eax), %xmm0
 movq %xmm0, -25(%edx)
L(fwd_write_17bytes):
 movq -17(%eax), %xmm0
 movq %xmm0, -17(%edx)
L(fwd_write_9bytes):
 movq -9(%eax), %xmm0
 movq %xmm0, -9(%edx)
L(fwd_write_1bytes):
 movzbl -1(%eax), %ecx /* Final single byte.  */
 movb %cl, -1(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax
# else
 movl DEST(%esp), %eax
# endif
 RETURN
2072 | |
 .p2align 4
/* Tail chain for N % 8 == 6 sizes: MOVQ copies down to 14 bytes,
   then 4 bytes + 2 bytes for the remainder.  */
L(fwd_write_46bytes):
 movq -46(%eax), %xmm0
 movq %xmm0, -46(%edx)
L(fwd_write_38bytes):
 movq -38(%eax), %xmm0
 movq %xmm0, -38(%edx)
L(fwd_write_30bytes):
 movq -30(%eax), %xmm0
 movq %xmm0, -30(%edx)
L(fwd_write_22bytes):
 movq -22(%eax), %xmm0
 movq %xmm0, -22(%edx)
L(fwd_write_14bytes):
 movq -14(%eax), %xmm0
 movq %xmm0, -14(%edx)
L(fwd_write_6bytes):
 movl -6(%eax), %ecx /* Remaining 6 bytes: dword + word.  */
 movl %ecx, -6(%edx)
 movzwl -2(%eax), %ecx
 movw %cx, -2(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax /* mempcpy returns dst + n.  */
# else
 movl DEST(%esp), %eax /* memcpy/memmove return original dst.  */
# endif
 RETURN

 .p2align 4
/* Tail chain for N % 8 == 2 sizes: MOVQ copies, then one word.  */
L(fwd_write_42bytes):
 movq -42(%eax), %xmm0
 movq %xmm0, -42(%edx)
L(fwd_write_34bytes):
 movq -34(%eax), %xmm0
 movq %xmm0, -34(%edx)
L(fwd_write_26bytes):
 movq -26(%eax), %xmm0
 movq %xmm0, -26(%edx)
L(fwd_write_18bytes):
 movq -18(%eax), %xmm0
 movq %xmm0, -18(%edx)
L(fwd_write_10bytes):
 movq -10(%eax), %xmm0
 movq %xmm0, -10(%edx)
L(fwd_write_2bytes):
 movzwl -2(%eax), %ecx /* Final 2 bytes.  */
 movw %cx, -2(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax
# else
 movl DEST(%esp), %eax
# endif
 RETURN
2126 | |
 .p2align 4
/* Tail chain for N % 8 == 7 sizes: MOVQ copies down to 15 bytes,
   then 4 + 2 + 1 bytes for the remainder.  */
L(fwd_write_47bytes):
 movq -47(%eax), %xmm0
 movq %xmm0, -47(%edx)
L(fwd_write_39bytes):
 movq -39(%eax), %xmm0
 movq %xmm0, -39(%edx)
L(fwd_write_31bytes):
 movq -31(%eax), %xmm0
 movq %xmm0, -31(%edx)
L(fwd_write_23bytes):
 movq -23(%eax), %xmm0
 movq %xmm0, -23(%edx)
L(fwd_write_15bytes):
 movq -15(%eax), %xmm0
 movq %xmm0, -15(%edx)
L(fwd_write_7bytes):
 movl -7(%eax), %ecx /* Remaining 7 bytes: dword + word + byte.  */
 movl %ecx, -7(%edx)
 movzwl -3(%eax), %ecx
 movzbl -1(%eax), %eax /* %eax no longer needed as src pointer.  */
 movw %cx, -3(%edx)
 movb %al, -1(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax /* mempcpy returns dst + n.  */
# else
 movl DEST(%esp), %eax /* memcpy/memmove return original dst.  */
# endif
 RETURN

 .p2align 4
/* Tail chain for N % 8 == 3 sizes: MOVQ copies, then word + byte.  */
L(fwd_write_43bytes):
 movq -43(%eax), %xmm0
 movq %xmm0, -43(%edx)
L(fwd_write_35bytes):
 movq -35(%eax), %xmm0
 movq %xmm0, -35(%edx)
L(fwd_write_27bytes):
 movq -27(%eax), %xmm0
 movq %xmm0, -27(%edx)
L(fwd_write_19bytes):
 movq -19(%eax), %xmm0
 movq %xmm0, -19(%edx)
L(fwd_write_11bytes):
 movq -11(%eax), %xmm0
 movq %xmm0, -11(%edx)
L(fwd_write_3bytes):
 movzwl -3(%eax), %ecx /* Final 3 bytes: word + byte.  */
 movzbl -1(%eax), %eax
 movw %cx, -3(%edx)
 movb %al, -1(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax
# else
 movl DEST(%esp), %eax
# endif
 RETURN
2184 | |
 .p2align 4
/* "_align" tails: like the plain fwd_write chains but the destination
   (one past the end, in %edx) allows 16-byte-aligned MOVDQA stores.
   This chain handles sizes 40/24/8/0.  */
L(fwd_write_40bytes_align):
 movdqa -40(%eax), %xmm0
 movdqa %xmm0, -40(%edx)
L(fwd_write_24bytes_align):
 movdqa -24(%eax), %xmm0
 movdqa %xmm0, -24(%edx)
L(fwd_write_8bytes_align):
 movq -8(%eax), %xmm0
 movq %xmm0, -8(%edx)
L(fwd_write_0bytes_align):
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax /* mempcpy returns dst + n.  */
# else
 movl DEST(%esp), %eax /* memcpy/memmove return original dst.  */
# endif
 RETURN

 .p2align 4
/* Aligned tails for sizes 32/16: two or one full MOVDQA copies.  */
L(fwd_write_32bytes_align):
 movdqa -32(%eax), %xmm0
 movdqa %xmm0, -32(%edx)
L(fwd_write_16bytes_align):
 movdqa -16(%eax), %xmm0
 movdqa %xmm0, -16(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax
# else
 movl DEST(%esp), %eax
# endif
 RETURN

 .p2align 4
/* Aligned tail for size 5: two overlapping 4-byte moves.  */
L(fwd_write_5bytes_align):
 movl -5(%eax), %ecx
 movl -4(%eax), %eax /* Overlaps previous store by 3 bytes.  */
 movl %ecx, -5(%edx)
 movl %eax, -4(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax
# else
 movl DEST(%esp), %eax
# endif
 RETURN
2229 | |
 .p2align 4
/* Aligned tails for sizes 45/29/13: MOVDQA copies down to 13 bytes,
   then quad + dword + byte for the remainder.  */
L(fwd_write_45bytes_align):
 movdqa -45(%eax), %xmm0
 movdqa %xmm0, -45(%edx)
L(fwd_write_29bytes_align):
 movdqa -29(%eax), %xmm0
 movdqa %xmm0, -29(%edx)
L(fwd_write_13bytes_align):
 movq -13(%eax), %xmm0
 movq %xmm0, -13(%edx)
 movl -5(%eax), %ecx /* Remaining 5 bytes: dword + byte.  */
 movl %ecx, -5(%edx)
 movzbl -1(%eax), %ecx
 movb %cl, -1(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax /* mempcpy returns dst + n.  */
# else
 movl DEST(%esp), %eax /* memcpy/memmove return original dst.  */
# endif
 RETURN

 .p2align 4
/* Aligned tails for sizes 37/21: MOVDQA copies, then dword + byte.  */
L(fwd_write_37bytes_align):
 movdqa -37(%eax), %xmm0
 movdqa %xmm0, -37(%edx)
L(fwd_write_21bytes_align):
 movdqa -21(%eax), %xmm0
 movdqa %xmm0, -21(%edx)
 movl -5(%eax), %ecx
 movl %ecx, -5(%edx)
 movzbl -1(%eax), %ecx
 movb %cl, -1(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax
# else
 movl DEST(%esp), %eax
# endif
 RETURN

 .p2align 4
/* Aligned tails for sizes 41/25/9/1: MOVDQA/MOVQ copies, then byte.  */
L(fwd_write_41bytes_align):
 movdqa -41(%eax), %xmm0
 movdqa %xmm0, -41(%edx)
L(fwd_write_25bytes_align):
 movdqa -25(%eax), %xmm0
 movdqa %xmm0, -25(%edx)
L(fwd_write_9bytes_align):
 movq -9(%eax), %xmm0
 movq %xmm0, -9(%edx)
L(fwd_write_1bytes_align):
 movzbl -1(%eax), %ecx /* Final single byte.  */
 movb %cl, -1(%edx)
# ifdef USE_AS_MEMPCPY
 movl %edx, %eax
# else
 movl DEST(%esp), %eax
# endif
 RETURN
2288 | |
	.p2align 4
/* Fall-through chain: 33 -> 17 byte tails; final byte copied singly.  */
L(fwd_write_33bytes_align):
	movdqa	-33(%eax), %xmm0
	movdqa	%xmm0, -33(%edx)
L(fwd_write_17bytes_align):
	movdqa	-17(%eax), %xmm0
	movdqa	%xmm0, -17(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
	RETURN
2304 | |
	.p2align 4
/* Fall-through chain: 46 -> 30 -> 14 -> 6 byte tails; final 6 bytes
   as 4 + 2.  */
L(fwd_write_46bytes_align):
	movdqa	-46(%eax), %xmm0
	movdqa	%xmm0, -46(%edx)
L(fwd_write_30bytes_align):
	movdqa	-30(%eax), %xmm0
	movdqa	%xmm0, -30(%edx)
L(fwd_write_14bytes_align):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes_align):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
	RETURN
2326 | |
	.p2align 4
/* Fall-through chain: 38 -> 22 byte tails; final 6 bytes as 4 + 2.  */
L(fwd_write_38bytes_align):
	movdqa	-38(%eax), %xmm0
	movdqa	%xmm0, -38(%edx)
L(fwd_write_22bytes_align):
	movdqa	-22(%eax), %xmm0
	movdqa	%xmm0, -22(%edx)
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
	RETURN
2344 | |
	.p2align 4
/* Fall-through chain: 42 -> 26 -> 10 -> 2 byte tails.  */
L(fwd_write_42bytes_align):
	movdqa	-42(%eax), %xmm0
	movdqa	%xmm0, -42(%edx)
L(fwd_write_26bytes_align):
	movdqa	-26(%eax), %xmm0
	movdqa	%xmm0, -26(%edx)
L(fwd_write_10bytes_align):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes_align):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
	RETURN
2364 | |
	.p2align 4
/* Fall-through chain: 34 -> 18 byte tails; final 2 bytes as one word.  */
L(fwd_write_34bytes_align):
	movdqa	-34(%eax), %xmm0
	movdqa	%xmm0, -34(%edx)
L(fwd_write_18bytes_align):
	movdqa	-18(%eax), %xmm0
	movdqa	%xmm0, -18(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
	RETURN
2380 | |
	.p2align 4
/* Fall-through chain: 47 -> 31 -> 15 -> 7 byte tails; final 7 bytes
   as 4 + 2 + 1 (all source bytes loaded before EAX is overwritten).  */
L(fwd_write_47bytes_align):
	movdqa	-47(%eax), %xmm0
	movdqa	%xmm0, -47(%edx)
L(fwd_write_31bytes_align):
	movdqa	-31(%eax), %xmm0
	movdqa	%xmm0, -31(%edx)
L(fwd_write_15bytes_align):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes_align):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
	RETURN
2404 | |
	.p2align 4
/* Fall-through chain: 39 -> 23 byte tails; final 7 bytes as 4 + 2 + 1.  */
L(fwd_write_39bytes_align):
	movdqa	-39(%eax), %xmm0
	movdqa	%xmm0, -39(%edx)
L(fwd_write_23bytes_align):
	movdqa	-23(%eax), %xmm0
	movdqa	%xmm0, -23(%edx)
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
	RETURN
2424 | |
	.p2align 4
/* Fall-through chain: 43 -> 27 -> 11 -> 3 byte tails; final 3 bytes
   as 2 + 1.  */
L(fwd_write_43bytes_align):
	movdqa	-43(%eax), %xmm0
	movdqa	%xmm0, -43(%edx)
L(fwd_write_27bytes_align):
	movdqa	-27(%eax), %xmm0
	movdqa	%xmm0, -27(%edx)
L(fwd_write_11bytes_align):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes_align):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
	RETURN
2446 | |
	.p2align 4
/* Fall-through chain: 35 -> 19 byte tails; final 3 bytes as 2 + 1.  */
L(fwd_write_35bytes_align):
	movdqa	-35(%eax), %xmm0
	movdqa	%xmm0, -35(%edx)
L(fwd_write_19bytes_align):
	movdqa	-19(%eax), %xmm0
	movdqa	%xmm0, -19(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
	RETURN
2464 | |
	.p2align 4
/* Fall-through chain: 44 -> 28 -> 12 -> 4 byte tails.  */
L(fwd_write_44bytes_align):
	movdqa	-44(%eax), %xmm0
	movdqa	%xmm0, -44(%edx)
L(fwd_write_28bytes_align):
	movdqa	-28(%eax), %xmm0
	movdqa	%xmm0, -28(%edx)
L(fwd_write_12bytes_align):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes_align):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
	RETURN
2484 | |
	.p2align 4
/* Fall-through chain: 36 -> 20 byte tails.  Last entry of the aligned
   forward group, hence RETURN_END (no CFI re-push afterwards; the
   CFI_PUSH for the next region follows separately).  */
L(fwd_write_36bytes_align):
	movdqa	-36(%eax), %xmm0
	movdqa	%xmm0, -36(%edx)
L(fwd_write_20bytes_align):
	movdqa	-20(%eax), %xmm0
	movdqa	%xmm0, -20(%edx)
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
	RETURN_END
2500 | |
	/* Re-establish %edi's saved-register CFI state: this path is
	   entered (from code above this chunk) with %edi pushed, but the
	   RETURN_END above dropped the unwind annotation.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Streaming path for very large copies: unaligned loads, non-temporal
   (cache-bypassing) stores.  EAX = src, EDX = dst, ECX = byte count.  */
L(large_page):
	movdqu	(%eax), %xmm1		/* First 16 source bytes.  */
# ifdef USE_AS_MEMMOVE
	/* NOTE(review): %xmm0 appears to be loaded before branching here
	   (above this chunk) with destination bytes that the overlapping
	   copy must preserve; DEST+4 skips the pushed %edi slot.  Confirm
	   against the preceding code.  */
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
# endif
	lea	16(%eax), %eax
	movntdq	%xmm1, (%edx)		/* Non-temporal store.  */
	lea	16(%edx), %edx
	/* 0x90 = 16 bytes already copied + 0x80 bias, so the SUB in the
	   loop below sets CF exactly when < 128 bytes remain.  */
	lea	-0x90(%ecx), %ecx
	POP (%edi)
2515 | |
	.p2align 4
/* Main streaming loop: 128 bytes per iteration.  ECX is biased by
   -0x80 (see above), so JAE loops while at least 128 bytes remain.  */
L(large_page_loop):
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax

	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)
	/* Un-bias ECX.  LEA does not touch flags, so JL still tests the
	   CMP: biased ECX < -0x40 means fewer than 64 real bytes left.  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(large_page_less_64bytes)

	/* Copy one more 64-byte chunk.  */
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	/* Copy one 32-byte chunk.  */
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	/* Advance EAX/EDX past the end and dispatch on the remaining
	   0..31 bytes; the table entries use negative offsets.  */
	add	%ecx, %edx
	add	%ecx, %eax
	sfence				/* Order the non-temporal stores before returning.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2570 | |
	.p2align 4
/* Backward-dispatch tails (used by the memmove backward path): EAX =
   src, EDX = dst point at the START of the remaining region.  Each
   label copies 8 bytes and falls through, handling 44/36/28/20/12/4/0
   byte remainders; the final 4 bytes use a plain 32-bit move.  */
L(bk_write_44bytes):
	movq	36(%eax), %xmm0
	movq	%xmm0, 36(%edx)
L(bk_write_36bytes):
	movq	28(%eax), %xmm0
	movq	%xmm0, 28(%edx)
L(bk_write_28bytes):
	movq	20(%eax), %xmm0
	movq	%xmm0, 20(%edx)
L(bk_write_20bytes):
	movq	12(%eax), %xmm0
	movq	%xmm0, 12(%edx)
L(bk_write_12bytes):
	movq	4(%eax), %xmm0
	movq	%xmm0, 4(%edx)
L(bk_write_4bytes):
	movl	(%eax), %ecx
	movl	%ecx, (%edx)
L(bk_write_0bytes):
	movl	DEST(%esp), %eax	/* Return the original destination.  */
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax		/* mempcpy returns dst + len.  */
# endif
	RETURN
2597 | |
	.p2align 4
/* Backward tails for 40/32/24/16/8 byte remainders (8 bytes per
   fall-through step).  EAX/EDX point at the start of the region.  */
L(bk_write_40bytes):
	movq	32(%eax), %xmm0
	movq	%xmm0, 32(%edx)
L(bk_write_32bytes):
	movq	24(%eax), %xmm0
	movq	%xmm0, 24(%edx)
L(bk_write_24bytes):
	movq	16(%eax), %xmm0
	movq	%xmm0, 16(%edx)
L(bk_write_16bytes):
	movq	8(%eax), %xmm0
	movq	%xmm0, 8(%edx)
L(bk_write_8bytes):
	movq	(%eax), %xmm0
	movq	%xmm0, (%edx)
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN
2620 | |
	.p2align 4
/* Backward tails for 45/37/29/21/13/5/1 byte remainders; the odd
   trailing 5 bytes are written as an overlapping 4-byte move at
   offset 1 plus the byte at offset 0.  */
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN
2649 | |
	.p2align 4
/* Backward tails for 41/33/25/17/9 byte remainders; the last 9 bytes
   are an 8-byte move at offset 1 plus the byte at offset 0.  */
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0
	movq	%xmm0, 1(%edx)
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN
2674 | |
	.p2align 4
/* Backward tails for 46/38/30/22/14/6 byte remainders; the last 6
   bytes are a 4-byte move at offset 2 plus a word at offset 0.  */
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN
2702 | |
	.p2align 4
/* Backward tails for 42/34/26/18/10/2 byte remainders.  */
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN
2728 | |
	.p2align 4
/* Backward tails for 47/39/31/23/15/7 byte remainders; the last 7
   bytes are written as 4 (offset 3) + 2 (offset 1) + 1 (offset 0).  */
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN
2758 | |
	.p2align 4
/* Backward tails for 43/35/27/19/11/3 byte remainders; the last 3
   bytes are a word at offset 1 plus the byte at offset 0.  Ends the
   backward group with RETURN_END.  */
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN_END
2786 | |
2787 | |
/* Jump tables.  Each entry is a 32-bit value produced by JMPTBL: for
   PIC builds an offset relative to the table base (added back by
   BRANCH_TO_JMPTBL_ENTRY), otherwise an absolute label address.  Kept
   in a read-only section separate from the code.  */
	.pushsection .rodata.ssse3,"a" ,@progbits
	.p2align 2
/* Forward tails, indexed by remaining length 0..47.  */
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	.p2align 2
/* Forward tails for the 16-byte-aligned case (MOVDQA variants),
   indexed by remaining length 0..47.  */
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

	.p2align 2
/* PALIGNR-shift dispatch, indexed by src/dst misalignment 0..15.  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	.p2align 2
/* Backward tails, indexed by remaining length 0..47.  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection
2962 | |
# ifdef USE_AS_MEMMOVE
	.p2align 4
/* memmove backward copy (used when the regions overlap and dst > src).
   On entry EAX = src, EDX = dst, ECX = len.  Both pointers are moved
   to one byte PAST the end and the copy proceeds downward; %edi holds
   the running source-end pointer.  */
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	lea	(%ecx,%edx,1),%edx	/* EDX = dst + len (end of dst).  */
	lea	(%ecx,%edi,1),%edi	/* EDI = src + len (end of src).  */
	testl	$0x3, %edx
	jnz	L(bk_align)		/* Destination end not 4-aligned.  */

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Rebase EAX/EDX to the start of the remaining region (the
	   backward tables index from the start), free %edi, dispatch.  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* The paths below are reached with %edi still pushed; restore
	   its CFI annotation after the dispatch above dropped it.  */
	CFI_PUSH (%edi)

	.p2align 4
/* Align the destination END to 4 bytes by copying 1 and/or 2 trailing
   bytes.  */
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)	/* Too short to bother aligning.  */
	testl	$1, %edx
	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  Copy 4 bytes at a time
   (at most three times) until the destination end is 16-aligned.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	.p2align 4
/* Main backward loop: 64 bytes per iteration, unaligned loads and
   aligned stores (EDX is 16-byte aligned here).  */
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

# endif
3084 | |
3085 | END (MEMCPY) |
3086 | |
3087 | #endif |
3088 | |