/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc)

# include "asm-syntax.h"

	.section .text.avx512,"ax",@progbits
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)

ENTRY (__mempcpy_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (__mempcpy_avx512_no_vzeroupper)

ENTRY (__memmove_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)

ENTRY (__memmove_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
	add	%RDX_LP, %RAX_LP
# endif
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
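	/* Set %rcx to one past the last source byte and %r9 to one past
	   the last destination byte; the tail loads and stores below use
	   negative offsets from them.  */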
	lea	(%rsi, %rdx), %rcx
	lea	(%rdi, %rdx), %r9
	cmp	$512, %rdx
	ja	L(512bytesormore)

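/* Copy at most 512 bytes; also the tail entry from the non-temporal
   paths.  Each size class loads both the head and the tail of the
   region before storing anything, so overlapping copies are safe.  */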
L(check):
	cmp	$16, %rdx
	jbe	L(less_16bytes)
	cmp	$256, %rdx
	jb	L(less_256bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	-0x100(%rcx), %zmm4
	vmovups	-0xC0(%rcx), %zmm5
	vmovups	-0x80(%rcx), %zmm6
	vmovups	-0x40(%rcx), %zmm7
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, -0x100(%r9)
	vmovups	%zmm5, -0xC0(%r9)
	vmovups	%zmm6, -0x80(%r9)
	vmovups	%zmm7, -0x40(%r9)
	ret

L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	-0x80(%rcx), %zmm2
	vmovups	-0x40(%rcx), %zmm3
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%r9)
	vmovups	%zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	-0x40(%rcx), %ymm2
	vmovdqu	-0x20(%rcx), %ymm3
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, 0x20(%rdi)
	vmovdqu	%ymm2, -0x40(%r9)
	vmovdqu	%ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-0x20(%rcx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rcx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -0x10(%r9)
	ret

L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	(%rsi), %rsi
	movq	-0x8(%rcx), %rcx
	movq	%rsi, (%rdi)
	movq	%rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	(%rsi), %esi
	mov	-0x4(%rcx), %ecx
	mov	%esi, (%rdi)
	mov	%ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	(%rsi), %si
	mov	-0x2(%rcx), %cx
	mov	%si, (%rdi)
	mov	%cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

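/* More than 512 bytes: sizes of at least half the shared cache take the
   non-temporal path, sizes up to 1024 bytes are copied in one pass
   below, and everything else goes through the 512-byte loops.  */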
L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %r8
# else
	mov	__x86_shared_cache_size_half(%rip), %r8
# endif
	cmp	%r8, %rdx
	jae	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)
	prefetcht1	(%rsi)
	prefetcht1	0x40(%rsi)
	prefetcht1	0x80(%rsi)
	prefetcht1	0xC0(%rsi)
	prefetcht1	0x100(%rsi)
	prefetcht1	0x140(%rsi)
	prefetcht1	0x180(%rsi)
	prefetcht1	0x1C0(%rsi)
	prefetcht1	-0x200(%rcx)
	prefetcht1	-0x1C0(%rcx)
	prefetcht1	-0x180(%rcx)
	prefetcht1	-0x140(%rcx)
	prefetcht1	-0x100(%rcx)
	prefetcht1	-0xC0(%rcx)
	prefetcht1	-0x80(%rcx)
	prefetcht1	-0x40(%rcx)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	vmovups	%zmm8, -0x200(%r9)
	vmovups	%zmm9, -0x1C0(%r9)
	vmovups	%zmm10, -0x180(%r9)
	vmovups	%zmm11, -0x140(%r9)
	vmovups	%zmm12, -0x100(%r9)
	vmovups	%zmm13, -0xC0(%r9)
	vmovups	%zmm14, -0x80(%r9)
	vmovups	%zmm15, -0x40(%r9)
	ret

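/* More than 1024 bytes, forward copy (destination at or below the
   source).  The last 512 bytes are loaded into %zmm8-%zmm15 up front so
   the loop cannot clobber them when source and destination overlap by
   less than 512 bytes; they are stored once the loop finishes.  */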
L(1024bytesormore):
	cmp	%rsi, %rdi
	ja	L(1024bytesormore_bkw)
	sub	$512, %r9
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	prefetcht1	(%rsi)
	prefetcht1	0x40(%rsi)
	prefetcht1	0x80(%rsi)
	prefetcht1	0xC0(%rsi)
	prefetcht1	0x100(%rsi)
	prefetcht1	0x140(%rsi)
	prefetcht1	0x180(%rsi)
	prefetcht1	0x1C0(%rsi)

/* Loop with unaligned memory access.  */
L(gobble_512bytes_loop):
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	add	$512, %rsi
	prefetcht1	(%rsi)
	prefetcht1	0x40(%rsi)
	prefetcht1	0x80(%rsi)
	prefetcht1	0xC0(%rsi)
	prefetcht1	0x100(%rsi)
	prefetcht1	0x140(%rsi)
	prefetcht1	0x180(%rsi)
	prefetcht1	0x1C0(%rsi)
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	add	$512, %rdi
	cmp	%r9, %rdi
	jb	L(gobble_512bytes_loop)
	vmovups	%zmm8, (%r9)
	vmovups	%zmm9, 0x40(%r9)
	vmovups	%zmm10, 0x80(%r9)
	vmovups	%zmm11, 0xC0(%r9)
	vmovups	%zmm12, 0x100(%r9)
	vmovups	%zmm13, 0x140(%r9)
	vmovups	%zmm14, 0x180(%r9)
	vmovups	%zmm15, 0x1C0(%r9)
	ret

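/* More than 1024 bytes, backward copy (destination above the source).
   The first 512 bytes of the source are preloaded into %zmm8-%zmm15 and
   stored last, for the same overlap reason as the forward loop.  */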
L(1024bytesormore_bkw):
	add	$512, %rdi
	vmovups	0x1C0(%rsi), %zmm8
	vmovups	0x180(%rsi), %zmm9
	vmovups	0x140(%rsi), %zmm10
	vmovups	0x100(%rsi), %zmm11
	vmovups	0xC0(%rsi), %zmm12
	vmovups	0x80(%rsi), %zmm13
	vmovups	0x40(%rsi), %zmm14
	vmovups	(%rsi), %zmm15
	prefetcht1	-0x40(%rcx)
	prefetcht1	-0x80(%rcx)
	prefetcht1	-0xC0(%rcx)
	prefetcht1	-0x100(%rcx)
	prefetcht1	-0x140(%rcx)
	prefetcht1	-0x180(%rcx)
	prefetcht1	-0x1C0(%rcx)
	prefetcht1	-0x200(%rcx)

/* Backward loop with unaligned memory access.  */
L(gobble_512bytes_loop_bkw):
	vmovups	-0x40(%rcx), %zmm0
	vmovups	-0x80(%rcx), %zmm1
	vmovups	-0xC0(%rcx), %zmm2
	vmovups	-0x100(%rcx), %zmm3
	vmovups	-0x140(%rcx), %zmm4
	vmovups	-0x180(%rcx), %zmm5
	vmovups	-0x1C0(%rcx), %zmm6
	vmovups	-0x200(%rcx), %zmm7
	sub	$512, %rcx
	prefetcht1	-0x40(%rcx)
	prefetcht1	-0x80(%rcx)
	prefetcht1	-0xC0(%rcx)
	prefetcht1	-0x100(%rcx)
	prefetcht1	-0x140(%rcx)
	prefetcht1	-0x180(%rcx)
	prefetcht1	-0x1C0(%rcx)
	prefetcht1	-0x200(%rcx)
	vmovups	%zmm0, -0x40(%r9)
	vmovups	%zmm1, -0x80(%r9)
	vmovups	%zmm2, -0xC0(%r9)
	vmovups	%zmm3, -0x100(%r9)
	vmovups	%zmm4, -0x140(%r9)
	vmovups	%zmm5, -0x180(%r9)
	vmovups	%zmm6, -0x1C0(%r9)
	vmovups	%zmm7, -0x200(%r9)
	sub	$512, %r9
	cmp	%rdi, %r9
	ja	L(gobble_512bytes_loop_bkw)
	vmovups	%zmm8, -0x40(%rdi)
	vmovups	%zmm9, -0x80(%rdi)
	vmovups	%zmm10, -0xC0(%rdi)
	vmovups	%zmm11, -0x100(%rdi)
	vmovups	%zmm12, -0x140(%rdi)
	vmovups	%zmm13, -0x180(%rdi)
	vmovups	%zmm14, -0x1C0(%rdi)
	vmovups	%zmm15, -0x200(%rdi)
	ret

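/* Sizes of at least half the shared cache: copy with non-temporal
   stores to avoid polluting the cache.  The first 128 source bytes are
   kept in %zmm4/%zmm5 and written to the original destination after the
   loop, which itself starts at the next 128-byte-aligned destination
   address.  */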
L(preloop_large):
	cmp	%rsi, %rdi
	ja	L(preloop_large_bkw)
	vmovups	(%rsi), %zmm4
	vmovups	0x40(%rsi), %zmm5

	mov	%rdi, %r11
/* Align destination for access with non-temporal stores in the loop.  */
	mov	%rdi, %r8
	and	$-0x80, %rdi
	add	$0x80, %rdi
	sub	%rdi, %r8
	sub	%r8, %rsi
	add	%r8, %rdx
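/* Copy 256 bytes per iteration with non-temporal stores, prefetching
   the source 512 bytes ahead of the current loads.  */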
L(gobble_256bytes_nt_loop):
	prefetcht1	0x200(%rsi)
	prefetcht1	0x240(%rsi)
	prefetcht1	0x280(%rsi)
	prefetcht1	0x2C0(%rsi)
	prefetcht1	0x300(%rsi)
	prefetcht1	0x340(%rsi)
	prefetcht1	0x380(%rsi)
	prefetcht1	0x3C0(%rsi)
	vmovdqu64	(%rsi), %zmm0
	vmovdqu64	0x40(%rsi), %zmm1
	vmovdqu64	0x80(%rsi), %zmm2
	vmovdqu64	0xC0(%rsi), %zmm3
	vmovntdq	%zmm0, (%rdi)
	vmovntdq	%zmm1, 0x40(%rdi)
	vmovntdq	%zmm2, 0x80(%rdi)
	vmovntdq	%zmm3, 0xC0(%rdi)
	sub	$256, %rdx
	add	$256, %rsi
	add	$256, %rdi
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop)
	sfence
	vmovups	%zmm4, (%r11)
	vmovups	%zmm5, 0x40(%r11)
	jmp	L(check)

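/* Backward variant of the non-temporal path: the last 128 source bytes
   are kept in %zmm4/%zmm5 and written after the loop, and the end of
   the destination is aligned down to a 128-byte boundary.  */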
L(preloop_large_bkw):
	vmovups	-0x80(%rcx), %zmm4
	vmovups	-0x40(%rcx), %zmm5

/* Align end of destination for access with non-temporal stores.  */
	mov	%r9, %r8
	and	$-0x80, %r9
	sub	%r9, %r8
	sub	%r8, %rcx
	sub	%r8, %rdx
	add	%r9, %r8
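/* Copy 256 bytes per iteration backwards with non-temporal stores,
   prefetching the source 512 bytes ahead in the direction of the
   copy.  */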
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1	-0x400(%rcx)
	prefetcht1	-0x3C0(%rcx)
	prefetcht1	-0x380(%rcx)
	prefetcht1	-0x340(%rcx)
	prefetcht1	-0x300(%rcx)
	prefetcht1	-0x2C0(%rcx)
	prefetcht1	-0x280(%rcx)
	prefetcht1	-0x240(%rcx)
	vmovdqu64	-0x100(%rcx), %zmm0
	vmovdqu64	-0xC0(%rcx), %zmm1
	vmovdqu64	-0x80(%rcx), %zmm2
	vmovdqu64	-0x40(%rcx), %zmm3
	vmovntdq	%zmm0, -0x100(%r9)
	vmovntdq	%zmm1, -0xC0(%r9)
	vmovntdq	%zmm2, -0x80(%r9)
	vmovntdq	%zmm3, -0x40(%r9)
	sub	$256, %rdx
	sub	$256, %rcx
	sub	$256, %r9
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop_bkw)
	sfence
	vmovups	%zmm4, -0x80(%r8)
	vmovups	%zmm5, -0x40(%r8)
	jmp	L(check)
END (__memmove_avx512_no_vzeroupper)

strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif
