/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc)

# include "asm-syntax.h"

	.section .text.avx512,"ax",@progbits
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)

ENTRY (__mempcpy_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (__mempcpy_avx512_no_vzeroupper)

ENTRY (__memmove_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)

ENTRY (__memmove_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
	add	%RDX_LP, %RAX_LP
# endif
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
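	/* Set %rcx to one past the last source byte and %r9 to one past
	   the last destination byte; the tail loads and stores below use
	   negative offsets from them.  */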
	lea	(%rsi, %rdx), %rcx
	lea	(%rdi, %rdx), %r9
	cmp	$512, %rdx
	ja	L(512bytesormore)

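/* Copy at most 512 bytes; also the tail entry from the non-temporal
   paths.  Each size class loads both the head and the tail of the
   region before storing anything, so overlapping copies are safe.  */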
L(check):
	cmp	$16, %rdx
	jbe	L(less_16bytes)
	cmp	$256, %rdx
	jb	L(less_256bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	-0x100(%rcx), %zmm4
	vmovups	-0xC0(%rcx), %zmm5
	vmovups	-0x80(%rcx), %zmm6
	vmovups	-0x40(%rcx), %zmm7
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, -0x100(%r9)
	vmovups	%zmm5, -0xC0(%r9)
	vmovups	%zmm6, -0x80(%r9)
	vmovups	%zmm7, -0x40(%r9)
	ret

L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	-0x80(%rcx), %zmm2
	vmovups	-0x40(%rcx), %zmm3
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%r9)
	vmovups	%zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	-0x40(%rcx), %ymm2
	vmovdqu	-0x20(%rcx), %ymm3
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, 0x20(%rdi)
	vmovdqu	%ymm2, -0x40(%r9)
	vmovdqu	%ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-0x20(%rcx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rcx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -0x10(%r9)
	ret

L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	(%rsi), %rsi
	movq	-0x8(%rcx), %rcx
	movq	%rsi, (%rdi)
	movq	%rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	(%rsi), %esi
	mov	-0x4(%rcx), %ecx
	mov	%esi, (%rdi)
	mov	%ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	(%rsi), %si
	mov	-0x2(%rcx), %cx
	mov	%si, (%rdi)
	mov	%cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

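/* More than 512 bytes: sizes of at least half the shared cache take the
   non-temporal path, sizes up to 1024 bytes are copied in one pass
   below, and everything else goes through the 512-byte loops.  */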
L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %r8
# else
	mov	__x86_shared_cache_size_half(%rip), %r8
# endif
	cmp	%r8, %rdx
	jae	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)
	prefetcht1	(%rsi)
	prefetcht1	0x40(%rsi)
	prefetcht1	0x80(%rsi)
	prefetcht1	0xC0(%rsi)
	prefetcht1	0x100(%rsi)
	prefetcht1	0x140(%rsi)
	prefetcht1	0x180(%rsi)
	prefetcht1	0x1C0(%rsi)
	prefetcht1	-0x200(%rcx)
	prefetcht1	-0x1C0(%rcx)
	prefetcht1	-0x180(%rcx)
	prefetcht1	-0x140(%rcx)
	prefetcht1	-0x100(%rcx)
	prefetcht1	-0xC0(%rcx)
	prefetcht1	-0x80(%rcx)
	prefetcht1	-0x40(%rcx)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	vmovups	%zmm8, -0x200(%r9)
	vmovups	%zmm9, -0x1C0(%r9)
	vmovups	%zmm10, -0x180(%r9)
	vmovups	%zmm11, -0x140(%r9)
	vmovups	%zmm12, -0x100(%r9)
	vmovups	%zmm13, -0xC0(%r9)
	vmovups	%zmm14, -0x80(%r9)
	vmovups	%zmm15, -0x40(%r9)
	ret

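/* More than 1024 bytes, forward copy (destination at or below the
   source).  The last 512 bytes are loaded into %zmm8-%zmm15 up front so
   the loop cannot clobber them when source and destination overlap by
   less than 512 bytes; they are stored once the loop finishes.  */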
L(1024bytesormore):
	cmp	%rsi, %rdi
	ja	L(1024bytesormore_bkw)
	sub	$512, %r9
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	prefetcht1	(%rsi)
	prefetcht1	0x40(%rsi)
	prefetcht1	0x80(%rsi)
	prefetcht1	0xC0(%rsi)
	prefetcht1	0x100(%rsi)
	prefetcht1	0x140(%rsi)
	prefetcht1	0x180(%rsi)
	prefetcht1	0x1C0(%rsi)

/* Loop with unaligned memory access.  */
L(gobble_512bytes_loop):
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	add	$512, %rsi
	prefetcht1	(%rsi)
	prefetcht1	0x40(%rsi)
	prefetcht1	0x80(%rsi)
	prefetcht1	0xC0(%rsi)
	prefetcht1	0x100(%rsi)
	prefetcht1	0x140(%rsi)
	prefetcht1	0x180(%rsi)
	prefetcht1	0x1C0(%rsi)
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	add	$512, %rdi
	cmp	%r9, %rdi
	jb	L(gobble_512bytes_loop)
	vmovups	%zmm8, (%r9)
	vmovups	%zmm9, 0x40(%r9)
	vmovups	%zmm10, 0x80(%r9)
	vmovups	%zmm11, 0xC0(%r9)
	vmovups	%zmm12, 0x100(%r9)
	vmovups	%zmm13, 0x140(%r9)
	vmovups	%zmm14, 0x180(%r9)
	vmovups	%zmm15, 0x1C0(%r9)
	ret

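/* More than 1024 bytes, backward copy (destination above the source).
   The first 512 bytes of the source are preloaded into %zmm8-%zmm15 and
   stored last, for the same overlap reason as the forward loop.  */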
L(1024bytesormore_bkw):
	add	$512, %rdi
	vmovups	0x1C0(%rsi), %zmm8
	vmovups	0x180(%rsi), %zmm9
	vmovups	0x140(%rsi), %zmm10
	vmovups	0x100(%rsi), %zmm11
	vmovups	0xC0(%rsi), %zmm12
	vmovups	0x80(%rsi), %zmm13
	vmovups	0x40(%rsi), %zmm14
	vmovups	(%rsi), %zmm15
	prefetcht1	-0x40(%rcx)
	prefetcht1	-0x80(%rcx)
	prefetcht1	-0xC0(%rcx)
	prefetcht1	-0x100(%rcx)
	prefetcht1	-0x140(%rcx)
	prefetcht1	-0x180(%rcx)
	prefetcht1	-0x1C0(%rcx)
	prefetcht1	-0x200(%rcx)

/* Backward loop with unaligned memory access.  */
L(gobble_512bytes_loop_bkw):
	vmovups	-0x40(%rcx), %zmm0
	vmovups	-0x80(%rcx), %zmm1
	vmovups	-0xC0(%rcx), %zmm2
	vmovups	-0x100(%rcx), %zmm3
	vmovups	-0x140(%rcx), %zmm4
	vmovups	-0x180(%rcx), %zmm5
	vmovups	-0x1C0(%rcx), %zmm6
	vmovups	-0x200(%rcx), %zmm7
	sub	$512, %rcx
	prefetcht1	-0x40(%rcx)
	prefetcht1	-0x80(%rcx)
	prefetcht1	-0xC0(%rcx)
	prefetcht1	-0x100(%rcx)
	prefetcht1	-0x140(%rcx)
	prefetcht1	-0x180(%rcx)
	prefetcht1	-0x1C0(%rcx)
	prefetcht1	-0x200(%rcx)
	vmovups	%zmm0, -0x40(%r9)
	vmovups	%zmm1, -0x80(%r9)
	vmovups	%zmm2, -0xC0(%r9)
	vmovups	%zmm3, -0x100(%r9)
	vmovups	%zmm4, -0x140(%r9)
	vmovups	%zmm5, -0x180(%r9)
	vmovups	%zmm6, -0x1C0(%r9)
	vmovups	%zmm7, -0x200(%r9)
	sub	$512, %r9
	cmp	%rdi, %r9
	ja	L(gobble_512bytes_loop_bkw)
	vmovups	%zmm8, -0x40(%rdi)
	vmovups	%zmm9, -0x80(%rdi)
	vmovups	%zmm10, -0xC0(%rdi)
	vmovups	%zmm11, -0x100(%rdi)
	vmovups	%zmm12, -0x140(%rdi)
	vmovups	%zmm13, -0x180(%rdi)
	vmovups	%zmm14, -0x1C0(%rdi)
	vmovups	%zmm15, -0x200(%rdi)
	ret

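/* Sizes of at least half the shared cache: copy with non-temporal
   stores to avoid polluting the cache.  The first 128 source bytes are
   kept in %zmm4/%zmm5 and written to the original destination after the
   loop, which itself starts at the next 128-byte-aligned destination
   address.  */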
L(preloop_large):
	cmp	%rsi, %rdi
	ja	L(preloop_large_bkw)
	vmovups	(%rsi), %zmm4
	vmovups	0x40(%rsi), %zmm5

	mov	%rdi, %r11
/* Align destination for access with non-temporal stores in the loop.  */
	mov	%rdi, %r8
	and	$-0x80, %rdi
	add	$0x80, %rdi
	sub	%rdi, %r8
	sub	%r8, %rsi
	add	%r8, %rdx
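/* Copy 256 bytes per iteration with non-temporal stores, prefetching
   the source 512 bytes ahead of the current loads.  */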
L(gobble_256bytes_nt_loop):
	prefetcht1	0x200(%rsi)
	prefetcht1	0x240(%rsi)
	prefetcht1	0x280(%rsi)
	prefetcht1	0x2C0(%rsi)
	prefetcht1	0x300(%rsi)
	prefetcht1	0x340(%rsi)
	prefetcht1	0x380(%rsi)
	prefetcht1	0x3C0(%rsi)
	vmovdqu64	(%rsi), %zmm0
	vmovdqu64	0x40(%rsi), %zmm1
	vmovdqu64	0x80(%rsi), %zmm2
	vmovdqu64	0xC0(%rsi), %zmm3
	vmovntdq	%zmm0, (%rdi)
	vmovntdq	%zmm1, 0x40(%rdi)
	vmovntdq	%zmm2, 0x80(%rdi)
	vmovntdq	%zmm3, 0xC0(%rdi)
	sub	$256, %rdx
	add	$256, %rsi
	add	$256, %rdi
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop)
	sfence
	vmovups	%zmm4, (%r11)
	vmovups	%zmm5, 0x40(%r11)
	jmp	L(check)

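/* Backward variant of the non-temporal path: the last 128 source bytes
   are kept in %zmm4/%zmm5 and written after the loop, and the end of
   the destination is aligned down to a 128-byte boundary.  */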
L(preloop_large_bkw):
	vmovups	-0x80(%rcx), %zmm4
	vmovups	-0x40(%rcx), %zmm5

/* Align end of destination for access with non-temporal stores.  */
	mov	%r9, %r8
	and	$-0x80, %r9
	sub	%r9, %r8
	sub	%r8, %rcx
	sub	%r8, %rdx
	add	%r9, %r8
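/* Copy 256 bytes per iteration backwards with non-temporal stores,
   prefetching the source 512 bytes ahead in the direction of the
   copy.  */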
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1	-0x400(%rcx)
	prefetcht1	-0x3C0(%rcx)
	prefetcht1	-0x380(%rcx)
	prefetcht1	-0x340(%rcx)
	prefetcht1	-0x300(%rcx)
	prefetcht1	-0x2C0(%rcx)
	prefetcht1	-0x280(%rcx)
	prefetcht1	-0x240(%rcx)
	vmovdqu64	-0x100(%rcx), %zmm0
	vmovdqu64	-0xC0(%rcx), %zmm1
	vmovdqu64	-0x80(%rcx), %zmm2
	vmovdqu64	-0x40(%rcx), %zmm3
	vmovntdq	%zmm0, -0x100(%r9)
	vmovntdq	%zmm1, -0xC0(%r9)
	vmovntdq	%zmm2, -0x80(%r9)
	vmovntdq	%zmm3, -0x40(%r9)
	sub	$256, %rdx
	sub	$256, %rcx
	sub	$256, %r9
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop_bkw)
	sfence
	vmovups	%zmm4, -0x80(%r8)
	vmovups	%zmm5, -0x40(%r8)
	jmp	L(check)
END (__memmove_avx512_no_vzeroupper)

strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif
