/* memcpy optimized with SSE2 unaligned memory access instructions.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

# include <sysdep.h>
# include "asm-syntax.h"

# ifndef MEMCPY
#  define MEMCPY	__memcpy_sse2_unaligned
#  define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
# endif

# define DEST	PARMS
# define SRC	DEST+4
# define LEN	SRC+4

# define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)
# define PARMS	8	/* Return address + saved EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)

	.section .text.sse2,"ax",@progbits
# if defined SHARED
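/* __memcpy_chk receives the size of the destination buffer as a
   fourth argument (16(%esp) on entry).  If it is smaller than LEN
   (12(%esp)), abort via __chk_fail; otherwise fall through into
   MEMCPY below.  */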
ENTRY (MEMCPY_CHK)
	movl	12(%esp), %eax
	cmpl	%eax, 16(%esp)
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
# endif

ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx
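	/* From here on: EDX = DEST, EAX = SRC, ECX = LEN.  */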
	cmp	%edx, %eax

# ifdef USE_AS_MEMMOVE
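	/* SRC above DEST: try the forward path; otherwise copy
	   backward so overlapping bytes are read before they are
	   overwritten.  */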
	ja	L(check_forward)

L(mm_len_0_or_more_backward):
/* Dispatch on the length.  The ranges [0..16], (16..32], (32..64],
   (64..128] and above 128 bytes are handled separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_backward)

/* Copy (16..32] bytes with two possibly overlapping 16-byte moves
   and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_backward)

/* Copy (32..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_backward)

/* Copy (64..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_128_or_more_backward):
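	/* If SRC + LEN <= DEST the regions do not overlap; use the
	   plain forward copy.  */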
	add	%ecx, %eax
	cmp	%edx, %eax
	movl	SRC(%esp), %eax
	jbe	L(forward)
	PUSH (%esi)
	PUSH (%edi)
	PUSH (%ebx)

/* Align the destination: save the first 64 bytes in XMM4-XMM7 and
   the last 16 bytes on the stack, then copy 64-byte blocks backward
   with 16-byte-aligned stores.  */
	movdqu	(%eax), %xmm4
	movdqu	16(%eax), %xmm5
	movdqu	32(%eax), %xmm6
	movdqu	48(%eax), %xmm7
	leal	(%edx, %ecx), %esi
	movdqu	-16(%eax, %ecx), %xmm0
	subl	$16, %esp
	movdqu	%xmm0, (%esp)
	mov	%ecx, %edi
	movl	%esi, %ecx
	andl	$-16, %ecx
	movl	%ecx, %ebx
	subl	%edx, %ebx
	addl	%ebx, %eax
	shrl	$6, %ebx
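	/* Now EDI = LEN, ESI = DEST + LEN, ECX = 16-byte-aligned end
	   of the destination, EAX = the matching source position and
	   EBX = the number of 64-byte blocks to copy backward.  */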

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	PUSH (%ebx)
	SETUP_PIC_REG (bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
	POP (%ebx)
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
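	/* Copies of at least half the shared cache size take the
	   non-temporal path so a huge copy does not evict the whole
	   cache.  */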
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%eax)

	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movaps	%xmm0, -64(%ecx)
	subl	$64, %eax
	movaps	%xmm1, -48(%ecx)
	movaps	%xmm2, -32(%ecx)
	movaps	%xmm3, -16(%ecx)
	subl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_main_loop_backward)
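	/* Reload the spilled last 16 bytes from the stack, then store
	   them and the saved first 64 bytes.  */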
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
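	/* Dispatch on the bits of LEN: bit 3 or 4 set means 8..16
	   bytes, bit 2 means 4..7, bit 1 means 2..3.  */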
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(return)

L(mm_len_2_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(return)

/* Backward copy of big lengths, using non-temporal stores.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movntdq	%xmm0, -64(%ecx)
	subl	$64, %eax
	movntdq	%xmm1, -48(%ecx)
	movntdq	%xmm2, -32(%ecx)
	movntdq	%xmm3, -16(%ecx)
	subl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_large_page_loop_backward)
	sfence
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

L(check_forward):
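	/* DEST < SRC.  If DEST + LEN <= SRC the regions do not
	   overlap, so the plain forward memcpy path can be used.  */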
	add	%edx, %ecx
	cmp	%eax, %ecx
	movl	LEN(%esp), %ecx
	jbe	L(forward)

/* Dispatch on the length as in the backward case: [0..16], (16..32],
   (32..64], (64..128] and above 128 bytes are handled separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy (16..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy (32..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy (64..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)
	PUSH (%ebx)

/* Align the destination: save the last 64 bytes in XMM4-XMM7 and
   the first 16 bytes on the stack, then copy 64-byte blocks forward
   with 16-byte-aligned stores.  */
	movdqu	-16(%eax, %ecx), %xmm4
	movdqu	-32(%eax, %ecx), %xmm5
	movdqu	-48(%eax, %ecx), %xmm6
	movdqu	-64(%eax, %ecx), %xmm7
	leal	(%edx, %ecx), %esi
	movdqu	(%eax), %xmm0
	subl	$16, %esp
	movdqu	%xmm0, (%esp)
	mov	%ecx, %edi
	leal	16(%edx), %ecx
	andl	$-16, %ecx
	movl	%ecx, %ebx
	subl	%edx, %ebx
	addl	%ebx, %eax
	movl	%esi, %ebx
	subl	%ecx, %ebx
	shrl	$6, %ebx
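	/* Now EDI = LEN, ESI = DEST + LEN, ECX = first 16-byte-aligned
	   address past DEST, EAX = the matching source position and
	   EBX = the number of 64-byte blocks to copy forward.  */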

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	PUSH (%ebx)
	SETUP_PIC_REG (bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
	POP (%ebx)
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax)

	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqa	%xmm0, (%ecx)
	addl	$64, %eax
	movaps	%xmm1, 16(%ecx)
	movaps	%xmm2, 32(%ecx)
	movaps	%xmm3, 48(%ecx)
	addl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_main_loop_forward)
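	/* Reload the spilled first 16 bytes from the stack, then store
	   them and the saved last 64 bytes.  */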
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, (%edx)
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

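/* Common exit for the large memmove paths: ESI and EDI were pushed
   on entry to those paths; EBX is restored by RETURN.  */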
L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Forward copy of big lengths, using non-temporal stores.  */
	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movntdq	%xmm0, (%ecx)
	addl	$64, %eax
	movntdq	%xmm1, 16(%ecx)
	movntdq	%xmm2, 32(%ecx)
	movntdq	%xmm3, 48(%ecx)
	addl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_large_page_loop_forward)
	sfence
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, (%edx)
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	POP (%ebx)
	jmp	L(mm_return_pop_all)
# endif

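/* Plain forward copy, the memcpy fast path: EAX = SRC, EDX = DEST,
   ECX = LEN, and the regions are assumed not to overlap.  */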
L(forward):
	cmp	$16, %ecx
	jbe	L(len_0_16_bytes)

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
# else
#  ifdef PIC
	SETUP_PIC_REG (bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
#  else
	cmp	__x86_shared_cache_size_half, %ecx
#  endif
# endif
	jae	L(large_page)

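	/* 16 < LEN: copy the first and last 16, 32 and 64 bytes with
	   unaligned, possibly overlapping moves, falling through to
	   the aligned main loop once LEN exceeds 128 bytes.  */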
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	cmpl	$32, %ecx
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jbe	L(return)

	movdqu	16(%eax), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	cmpl	$64, %ecx
	movdqu	%xmm0, 16(%edx)
	movdqu	%xmm1, -32(%edx, %ecx)
	jbe	L(return)

	movdqu	32(%eax), %xmm0
	movdqu	48(%eax), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3
	cmpl	$128, %ecx
	movdqu	%xmm0, 32(%edx)
	movdqu	%xmm1, 48(%edx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	jbe	L(return)

/* Now the main loop: align the destination address to 64 bytes and
   copy 64 bytes per iteration.  */
	leal	64(%edx), %ebx
	andl	$-64, %ebx

	addl	%edx, %ecx
	andl	$-64, %ecx

	subl	%edx, %eax
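	/* EBX = first 64-byte-aligned destination address, ECX = the
	   64-byte-aligned end, EAX = SRC - DEST so that (EBX,EAX)
	   addresses the source.  */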

/* Peel off the last two iterations: the loop prefetches 128 bytes
   ahead, so stopping early keeps the prefetch within the source
   buffer.  */
	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_just_one_iteration)

	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_last_two_iterations)

	.p2align 4
L(main_loop_cache):

	prefetcht0 128(%ebx, %eax)

	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	lea	64(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_cache)

L(main_loop_last_two_iterations):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	movaps	%xmm4, 64(%ebx)
	movaps	%xmm5, 80(%ebx)
	movaps	%xmm6, 96(%ebx)
	movaps	%xmm7, 112(%ebx)
	jmp	L(return)

L(main_loop_just_one_iteration):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	jmp	L(return)

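/* LEN is at least half the shared cache: copy the first and last
   128 bytes with unaligned moves, then stream 128-byte blocks with
   non-temporal stores to 128-byte-aligned destination addresses.  */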
L(large_page):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)

	movdqu	64(%eax), %xmm0
	movdqu	80(%eax), %xmm1
	movdqu	96(%eax), %xmm2
	movdqu	112(%eax), %xmm3
	movdqu	-128(%eax, %ecx), %xmm4
	movdqu	-112(%eax, %ecx), %xmm5
	movdqu	-96(%eax, %ecx), %xmm6
	movdqu	-80(%eax, %ecx), %xmm7
	movdqu	%xmm0, 64(%edx)
	movdqu	%xmm1, 80(%edx)
	movdqu	%xmm2, 96(%edx)
	movdqu	%xmm3, 112(%edx)
	movdqu	%xmm4, -128(%edx, %ecx)
	movdqu	%xmm5, -112(%edx, %ecx)
	movdqu	%xmm6, -96(%edx, %ecx)
	movdqu	%xmm7, -80(%edx, %ecx)

/* Now the main loop with non-temporal stores: align the destination
   address to 128 bytes and copy 128 bytes per iteration.  */
	leal	128(%edx), %ebx
	andl	$-128, %ebx

	addl	%edx, %ecx
	andl	$-128, %ecx

	subl	%edx, %eax

	.p2align 4
L(main_loop_large_page):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movntdq	%xmm0, (%ebx)
	movntdq	%xmm1, 16(%ebx)
	movntdq	%xmm2, 32(%ebx)
	movntdq	%xmm3, 48(%ebx)
	movntdq	%xmm4, 64(%ebx)
	movntdq	%xmm5, 80(%ebx)
	movntdq	%xmm6, 96(%ebx)
	movntdq	%xmm7, 112(%ebx)
	lea	128(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_large_page)
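	/* Order the non-temporal stores before returning.  */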
	sfence
	jmp	L(return)

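/* LEN <= 16: bit 3 or 4 of LEN set means 8..16 bytes, bit 2 means
   4..7; the remaining lengths are copied a byte and a word at a
   time.  */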
L(len_0_16_bytes):
	testb	$24, %cl
	jne	L(len_9_16_bytes)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(len_5_8_bytes)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	movzbl	(%eax), %ebx
	testb	$2, %cl
	movb	%bl, (%edx)
	je	L(return)
	movzwl	-2(%eax,%ecx), %ebx
	movw	%bx, -2(%edx,%ecx)
	jmp	L(return)

L(len_9_16_bytes):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

L(len_5_8_bytes):
	movl	(%eax), %ebx
	movl	%ebx, (%edx)
	movl	-4(%eax,%ecx), %ebx
	movl	%ebx, -4(%edx,%ecx)
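	/* Fall through into L(return).  */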

L(return):
	movl	%edx, %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN

END (MEMCPY)
#endif