/* memcpy optimized with SSE2 unaligned memory access instructions.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

# include <sysdep.h>
# include "asm-syntax.h"

# ifndef MEMCPY
#  define MEMCPY	__memcpy_sse2_unaligned
#  define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
# endif

# define DEST	PARMS
# define SRC	DEST+4
# define LEN	SRC+4

# define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)
# define PARMS	8	/* Return address + saved EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)

	.section .text.sse2,"ax",@progbits
# if defined SHARED
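/* __memcpy_chk receives the size of the destination buffer as a
   fourth argument (16(%esp) on entry).  If it is smaller than LEN
   (12(%esp)), abort via __chk_fail; otherwise fall through into
   MEMCPY below.  */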
ENTRY (MEMCPY_CHK)
	movl	12(%esp), %eax
	cmpl	%eax, 16(%esp)
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
# endif

ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx
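	/* From here on: EDX = DEST, EAX = SRC, ECX = LEN.  */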
	cmp	%edx, %eax

# ifdef USE_AS_MEMMOVE
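	/* SRC above DEST: try the forward path; otherwise copy
	   backward so overlapping bytes are read before they are
	   overwritten.  */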
	ja	L(check_forward)

L(mm_len_0_or_more_backward):
/* Dispatch on the length.  The ranges [0..16], (16..32], (32..64],
   (64..128] and above 128 bytes are handled separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_backward)

/* Copy (16..32] bytes with two possibly overlapping 16-byte moves
   and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_backward)

/* Copy (32..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_backward)

/* Copy (64..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_128_or_more_backward):
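	/* If SRC + LEN <= DEST the regions do not overlap; use the
	   plain forward copy.  */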
	add	%ecx, %eax
	cmp	%edx, %eax
	movl	SRC(%esp), %eax
	jbe	L(forward)
	PUSH (%esi)
	PUSH (%edi)
	PUSH (%ebx)

/* Align the destination: save the first 64 bytes in XMM4-XMM7 and
   the last 16 bytes on the stack, then copy 64-byte blocks backward
   with 16-byte-aligned stores.  */
	movdqu	(%eax), %xmm4
	movdqu	16(%eax), %xmm5
	movdqu	32(%eax), %xmm6
	movdqu	48(%eax), %xmm7
	leal	(%edx, %ecx), %esi
	movdqu	-16(%eax, %ecx), %xmm0
	subl	$16, %esp
	movdqu	%xmm0, (%esp)
	mov	%ecx, %edi
	movl	%esi, %ecx
	andl	$-16, %ecx
	movl	%ecx, %ebx
	subl	%edx, %ebx
	addl	%ebx, %eax
	shrl	$6, %ebx
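	/* Now EDI = LEN, ESI = DEST + LEN, ECX = 16-byte-aligned end
	   of the destination, EAX = the matching source position and
	   EBX = the number of 64-byte blocks to copy backward.  */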

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	PUSH (%ebx)
	SETUP_PIC_REG (bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
	POP (%ebx)
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
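	/* Copies of at least half the shared cache size take the
	   non-temporal path so a huge copy does not evict the whole
	   cache.  */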
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%eax)

	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movaps	%xmm0, -64(%ecx)
	subl	$64, %eax
	movaps	%xmm1, -48(%ecx)
	movaps	%xmm2, -32(%ecx)
	movaps	%xmm3, -16(%ecx)
	subl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_main_loop_backward)
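	/* Reload the spilled last 16 bytes from the stack, then store
	   them and the saved first 64 bytes.  */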
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
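	/* Dispatch on the bits of LEN: bit 3 or 4 set means 8..16
	   bytes, bit 2 means 4..7, bit 1 means 2..3.  */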
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(return)

L(mm_len_2_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(return)

/* Backward copy of big lengths, using non-temporal stores.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%eax), %xmm0
	movdqu	-48(%eax), %xmm1
	movdqu	-32(%eax), %xmm2
	movdqu	-16(%eax), %xmm3
	movntdq	%xmm0, -64(%ecx)
	subl	$64, %eax
	movntdq	%xmm1, -48(%ecx)
	movntdq	%xmm2, -32(%ecx)
	movntdq	%xmm3, -16(%ecx)
	subl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_large_page_loop_backward)
	sfence
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, -16(%esi)
	movdqu	%xmm4, (%edx)
	movdqu	%xmm5, 16(%edx)
	movdqu	%xmm6, 32(%edx)
	movdqu	%xmm7, 48(%edx)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

L(check_forward):
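	/* DEST < SRC.  If DEST + LEN <= SRC the regions do not
	   overlap, so the plain forward memcpy path can be used.  */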
	add	%edx, %ecx
	cmp	%eax, %ecx
	movl	LEN(%esp), %ecx
	jbe	L(forward)

/* Dispatch on the length as in the backward case: [0..16], (16..32],
   (32..64], (64..128] and above 128 bytes are handled separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy (16..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy (32..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy (64..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)
	PUSH (%ebx)

/* Align the destination: save the last 64 bytes in XMM4-XMM7 and
   the first 16 bytes on the stack, then copy 64-byte blocks forward
   with 16-byte-aligned stores.  */
	movdqu	-16(%eax, %ecx), %xmm4
	movdqu	-32(%eax, %ecx), %xmm5
	movdqu	-48(%eax, %ecx), %xmm6
	movdqu	-64(%eax, %ecx), %xmm7
	leal	(%edx, %ecx), %esi
	movdqu	(%eax), %xmm0
	subl	$16, %esp
	movdqu	%xmm0, (%esp)
	mov	%ecx, %edi
	leal	16(%edx), %ecx
	andl	$-16, %ecx
	movl	%ecx, %ebx
	subl	%edx, %ebx
	addl	%ebx, %eax
	movl	%esi, %ebx
	subl	%ecx, %ebx
	shrl	$6, %ebx
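	/* Now EDI = LEN, ESI = DEST + LEN, ECX = first 16-byte-aligned
	   address past DEST, EAX = the matching source position and
	   EBX = the number of 64-byte blocks to copy forward.  */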

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %edi
# else
#  ifdef PIC
	PUSH (%ebx)
	SETUP_PIC_REG (bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
	POP (%ebx)
#  else
	cmp	__x86_shared_cache_size_half, %edi
#  endif
# endif
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax)

	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqa	%xmm0, (%ecx)
	addl	$64, %eax
	movaps	%xmm1, 16(%ecx)
	movaps	%xmm2, 32(%ecx)
	movaps	%xmm3, 48(%ecx)
	addl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_main_loop_forward)
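	/* Reload the spilled first 16 bytes from the stack, then store
	   them and the saved last 64 bytes.  */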
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, (%edx)
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	POP (%ebx)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

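/* Common exit for the large memmove paths: ESI and EDI were pushed
   on entry to those paths; EBX is restored by RETURN.  */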
L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Forward copy of big lengths, using non-temporal stores.  */
	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movntdq	%xmm0, (%ecx)
	addl	$64, %eax
	movntdq	%xmm1, 16(%ecx)
	movntdq	%xmm2, 32(%ecx)
	movntdq	%xmm3, 48(%ecx)
	addl	$64, %ecx
	sub	$1, %ebx
	jnz	L(mm_large_page_loop_forward)
	sfence
	movdqu	(%esp), %xmm0
	addl	$16, %esp
	movdqu	%xmm0, (%edx)
	movdqu	%xmm4, -16(%esi)
	movdqu	%xmm5, -32(%esi)
	movdqu	%xmm6, -48(%esi)
	movdqu	%xmm7, -64(%esi)
	POP (%ebx)
	jmp	L(mm_return_pop_all)
# endif

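/* Plain forward copy, the memcpy fast path: EAX = SRC, EDX = DEST,
   ECX = LEN, and the regions are assumed not to overlap.  */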
L(forward):
	cmp	$16, %ecx
	jbe	L(len_0_16_bytes)

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
# else
#  ifdef PIC
	SETUP_PIC_REG (bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
#  else
	cmp	__x86_shared_cache_size_half, %ecx
#  endif
# endif
	jae	L(large_page)

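	/* 16 < LEN: copy the first and last 16, 32 and 64 bytes with
	   unaligned, possibly overlapping moves, falling through to
	   the aligned main loop once LEN exceeds 128 bytes.  */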
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	cmpl	$32, %ecx
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jbe	L(return)

	movdqu	16(%eax), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	cmpl	$64, %ecx
	movdqu	%xmm0, 16(%edx)
	movdqu	%xmm1, -32(%edx, %ecx)
	jbe	L(return)

	movdqu	32(%eax), %xmm0
	movdqu	48(%eax), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3
	cmpl	$128, %ecx
	movdqu	%xmm0, 32(%edx)
	movdqu	%xmm1, 48(%edx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	jbe	L(return)

/* Now the main loop: align the destination address to 64 bytes and
   copy 64 bytes per iteration.  */
	leal	64(%edx), %ebx
	andl	$-64, %ebx

	addl	%edx, %ecx
	andl	$-64, %ecx

	subl	%edx, %eax
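	/* EBX = first 64-byte-aligned destination address, ECX = the
	   64-byte-aligned end, EAX = SRC - DEST so that (EBX,EAX)
	   addresses the source.  */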

/* Peel off the last two iterations: the loop prefetches 128 bytes
   ahead, so stopping early keeps the prefetch within the source
   buffer.  */
	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_just_one_iteration)

	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_last_two_iterations)

	.p2align 4
L(main_loop_cache):

	prefetcht0 128(%ebx, %eax)

	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	lea	64(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_cache)

L(main_loop_last_two_iterations):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	movaps	%xmm4, 64(%ebx)
	movaps	%xmm5, 80(%ebx)
	movaps	%xmm6, 96(%ebx)
	movaps	%xmm7, 112(%ebx)
	jmp	L(return)

L(main_loop_just_one_iteration):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movaps	%xmm1, 16(%ebx)
	movaps	%xmm2, 32(%ebx)
	movaps	%xmm3, 48(%ebx)
	jmp	L(return)

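/* LEN is at least half the shared cache: copy the first and last
   128 bytes with unaligned moves, then stream 128-byte blocks with
   non-temporal stores to 128-byte-aligned destination addresses.  */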
L(large_page):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)

	movdqu	64(%eax), %xmm0
	movdqu	80(%eax), %xmm1
	movdqu	96(%eax), %xmm2
	movdqu	112(%eax), %xmm3
	movdqu	-128(%eax, %ecx), %xmm4
	movdqu	-112(%eax, %ecx), %xmm5
	movdqu	-96(%eax, %ecx), %xmm6
	movdqu	-80(%eax, %ecx), %xmm7
	movdqu	%xmm0, 64(%edx)
	movdqu	%xmm1, 80(%edx)
	movdqu	%xmm2, 96(%edx)
	movdqu	%xmm3, 112(%edx)
	movdqu	%xmm4, -128(%edx, %ecx)
	movdqu	%xmm5, -112(%edx, %ecx)
	movdqu	%xmm6, -96(%edx, %ecx)
	movdqu	%xmm7, -80(%edx, %ecx)

/* Now the main loop with non-temporal stores: align the destination
   address to 128 bytes and copy 128 bytes per iteration.  */
	leal	128(%edx), %ebx
	andl	$-128, %ebx

	addl	%edx, %ecx
	andl	$-128, %ecx

	subl	%edx, %eax

	.p2align 4
L(main_loop_large_page):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movntdq	%xmm0, (%ebx)
	movntdq	%xmm1, 16(%ebx)
	movntdq	%xmm2, 32(%ebx)
	movntdq	%xmm3, 48(%ebx)
	movntdq	%xmm4, 64(%ebx)
	movntdq	%xmm5, 80(%ebx)
	movntdq	%xmm6, 96(%ebx)
	movntdq	%xmm7, 112(%ebx)
	lea	128(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_large_page)
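	/* Order the non-temporal stores before returning.  */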
	sfence
	jmp	L(return)

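/* LEN <= 16: bit 3 or 4 of LEN set means 8..16 bytes, bit 2 means
   4..7; the remaining lengths are copied a byte and a word at a
   time.  */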
L(len_0_16_bytes):
	testb	$24, %cl
	jne	L(len_9_16_bytes)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(len_5_8_bytes)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	movzbl	(%eax), %ebx
	testb	$2, %cl
	movb	%bl, (%edx)
	je	L(return)
	movzwl	-2(%eax,%ecx), %ebx
	movw	%bx, -2(%edx,%ecx)
	jmp	L(return)

L(len_9_16_bytes):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

L(len_5_8_bytes):
	movl	(%eax), %ebx
	movl	%ebx, (%edx)
	movl	-4(%eax,%ecx), %ebx
	movl	%ebx, -4(%edx,%ecx)
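	/* Fall through into L(return).  */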

L(return):
	movl	%edx, %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
	RETURN

END (MEMCPY)
#endif