1/* memcpy with SSSE3
2 Copyright (C) 2010-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
21#if IS_IN (libc)
22
23#include "asm-syntax.h"
24
25#ifndef MEMCPY
26# define MEMCPY __memcpy_ssse3
27# define MEMCPY_CHK __memcpy_chk_ssse3
28# define MEMPCPY __mempcpy_ssse3
29# define MEMPCPY_CHK __mempcpy_chk_ssse3
30#endif
31
/* Expands to the difference I - B, i.e. the offset of I relative to B.  */
32#define JMPTBL(I, B) I - B
33
34/* Branch to an entry in a jump table.  TABLE is a jump table with
   relative offsets.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.
   The macro loads the address of TABLE into %r11, sign-extends the
   selected 32-bit entry into INDEX, adds the table address to it and
   jumps there.  Both %r11 and INDEX are therefore overwritten; at the
   jump target INDEX holds the target address itself.  The ud2 after
   the jump is not reached by the jump.  */
37#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
38 lea TABLE(%rip), %r11; \
39 movslq (%r11, INDEX, SCALE), INDEX; \
40 lea (%r11, INDEX), INDEX; \
41 _CET_NOTRACK jmp *INDEX; \
42 ud2
43
44 .section .text.ssse3,"ax",@progbits
45#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
/* Checked mempcpy entry.  Jumps to __chk_fail when %RCX_LP is below
   the length in %RDX_LP (unsigned compare); %RCX_LP is presumably the
   destination object size supplied by the caller -- confirm against
   the fortify wrappers.  There is no ret or jmp before END, so when the
   check passes execution continues with the code placed directly after
   this symbol.  */
46ENTRY (MEMPCPY_CHK)
47 cmp %RDX_LP, %RCX_LP
48 jb HIDDEN_JUMPTARGET (__chk_fail)
49END (MEMPCPY_CHK)
50
/* mempcpy entry.  In: %rdi = destination, %rsi = source, %rdx = length.
   Sets the return value %rax = destination + length and then shares the
   copy code starting at L(start).  */
51ENTRY (MEMPCPY)
52 mov %RDI_LP, %RAX_LP
53 add %RDX_LP, %RAX_LP
#ifdef __ILP32__
    /* Clear the upper 32 bits.  L(start) compares and indexes with the
       full %rdx, and jumping there skips the zero-extension performed
       at the MEMCPY entry.  */
    mov %edx, %edx
#endif
54 jmp L(start)
55END (MEMPCPY)
56#endif
57
58#if !defined USE_AS_BCOPY
/* Checked entry, omitted when USE_AS_BCOPY is defined.  Jumps to
   __chk_fail when %RCX_LP is below the length in %RDX_LP (unsigned
   compare); %RCX_LP is presumably the destination object size -- confirm
   against the fortify wrappers.  There is no ret or jmp before END, so
   when the check passes execution continues with the code placed
   directly after this symbol.  */
59ENTRY (MEMCPY_CHK)
60 cmp %RDX_LP, %RCX_LP
61 jb HIDDEN_JUMPTARGET (__chk_fail)
62END (MEMCPY_CHK)
63#endif
64
/* Main entry.
   In:  %rdi = destination, %rsi = source, %rdx = length in bytes.
   Out: %rax = destination (destination + length when built with
        USE_AS_MEMPCPY).
   Lengths up to 79 are dispatched through L(table_less_80bytes);
   longer ones continue at L(80bytesormore).  */
65ENTRY (MEMCPY)
66 mov %RDI_LP, %RAX_LP
67#ifdef USE_AS_MEMPCPY
68 add %RDX_LP, %RAX_LP
69#endif
70
71#ifdef __ILP32__
72 /* Clear the upper 32 bits. */
73 mov %edx, %edx
74#endif
75
76#ifdef USE_AS_MEMMOVE
    /* Pick a copy direction: forward when destination is below source
       (unsigned) or the length is at most 79, L(write_0bytes) when both
       pointers are equal, backward otherwise.  */
77 cmp %rsi, %rdi
78 jb L(copy_forward)
79 je L(write_0bytes)
80 cmp $79, %rdx
81 jbe L(copy_forward)
82 jmp L(copy_backward)
83L(copy_forward):
84#endif
85L(start):
86 cmp $79, %rdx
    /* lea does not modify flags, so the ja below still tests the cmp.  */
87 lea L(table_less_80bytes)(%rip), %r11
88 ja L(80bytesormore)
    /* %r9 = 32-bit table entry for this length, sign-extended.  */
89 movslq (%r11, %rdx, 4), %r9
    /* Both pointers are moved to the end of their buffers before the
       jump; the handlers are outside this excerpt and presumably
       address the data with negative offsets -- confirm there.  */
90 add %rdx, %rsi
91 add %rdx, %rdi
    /* Entry is relative to the table base: make it an absolute address.  */
92 add %r11, %r9
93 _CET_NOTRACK jmp *%r9
94 ud2
95
/* Forward path for lengths above 79.
   Saves the first 16 source bytes in %xmm0 and the original
   destination in %r8, rounds %rdi up to the next 16-byte boundary and
   moves %rsi/%rdx by the same amount.  Then dispatches on the size and
   on the low four bits of the adjusted source pointer.  */
96 .p2align 4
97L(80bytesormore):
98#ifndef USE_AS_MEMMOVE
    /* Signed comparison of the low bytes of the two pointers: take the
       backward path when the low byte of %rsi is less than or equal to
       that of %rdi.  NOTE(review): the rationale is not stated in this
       file; presumably an address-aliasing heuristic -- confirm.  */
99 cmp %dil, %sil
100 jle L(copy_backward)
101#endif
102
103 movdqu (%rsi), %xmm0
104 mov %rdi, %rcx
    /* %rdi = first 16-byte boundary strictly above the destination.  */
105 and $-16, %rdi
106 add $16, %rdi
107 mov %rcx, %r8
    /* %rcx = old %rdi - new %rdi, a value in [-16, -1].  */
108 sub %rdi, %rcx
    /* Shrink the length and advance the source by the skipped bytes.  */
109 add %rcx, %rdx
110 sub %rcx, %rsi
111
112#ifdef SHARED_CACHE_SIZE_HALF
113 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
114#else
115 mov __x86_shared_cache_size_half(%rip), %RCX_LP
116#endif
117 cmp %rcx, %rdx
    /* mov leaves the flags of the cmp intact for the ja.  */
118 mov %rsi, %r9
119 ja L(large_page_fwd)
    /* %r9 = source offset within its 16-byte block; zero means source
       and destination are both aligned.  */
120 and $0xf, %r9
121 jz L(shl_0)
    /* %rcx = threshold the selected handler compares %rdx against.  */
122#ifdef DATA_CACHE_SIZE_HALF
123 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
124#else
125 mov __x86_data_cache_size_half(%rip), %RCX_LP
126#endif
127 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
128
/* Backward path.
   Saves the last 16 source bytes in %xmm0 and their destination
   address (destination + length - 16) in %r8, moves both pointers to
   the end of their buffers and rounds %rdi down to a 16-byte boundary,
   adjusting %rsi and %rdx by the same amount.  Then dispatches on the
   size and on the low four bits of the adjusted source pointer.  */
129 .p2align 4
130L(copy_backward):
131 movdqu -16(%rsi, %rdx), %xmm0
132 add %rdx, %rsi
133 lea -16(%rdi, %rdx), %r8
134 add %rdx, %rdi
135
136 mov %rdi, %rcx
    /* %rcx = low four bits of the destination end; the xor clears
       exactly those bits in %rdi.  */
137 and $0xf, %rcx
138 xor %rcx, %rdi
139 sub %rcx, %rdx
140 sub %rcx, %rsi
141
142#ifdef SHARED_CACHE_SIZE_HALF
143 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
144#else
145 mov __x86_shared_cache_size_half(%rip), %RCX_LP
146#endif
147
148 cmp %rcx, %rdx
    /* mov leaves the flags of the cmp intact for the ja.  */
149 mov %rsi, %r9
150 ja L(large_page_bwd)
    /* %r9 = offset of the source end within its 16-byte block.  */
151 and $0xf, %r9
152 jz L(shl_0_bwd)
    /* %rcx = threshold the selected handler compares %rdx against.  */
153#ifdef DATA_CACHE_SIZE_HALF
154 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
155#else
156 mov __x86_data_cache_size_half(%rip), %RCX_LP
157#endif
158 BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
159
/* Forward copy with %rsi and %rdi both 16-byte aligned (all loads and
   stores through them below are aligned moves).
   In: %rdx = bytes left, %xmm0 = saved 16-byte block that is stored
   unaligned to (%r8).  */
160 .p2align 4
161L(shl_0):
162 sub $16, %rdx
163 movdqa (%rsi), %xmm1
164 add $16, %rsi
165 movdqa %xmm1, (%rdi)
166 add $16, %rdi
167 cmp $128, %rdx
    /* The store does not touch flags; ja still tests the cmp above.  */
168 movdqu %xmm0, (%r8)
169 ja L(shl_0_gobble)
170 cmp $64, %rdx
171 jb L(shl_0_less_64bytes)
    /* Between 64 and 128 bytes left: copy one 64-byte chunk.  */
172 movaps (%rsi), %xmm4
173 movaps 16(%rsi), %xmm1
174 movaps 32(%rsi), %xmm2
175 movaps 48(%rsi), %xmm3
176 movaps %xmm4, (%rdi)
177 movaps %xmm1, 16(%rdi)
178 movaps %xmm2, 32(%rdi)
179 movaps %xmm3, 48(%rdi)
180 sub $64, %rdx
181 add $64, %rsi
182 add $64, %rdi
183L(shl_0_less_64bytes):
    /* Move both pointers past the remaining bytes and dispatch on the
       remaining count (handlers are outside this excerpt).  */
184 add %rdx, %rsi
185 add %rdx, %rdi
186 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
187
/* Aligned forward copy of more than 128 bytes.  Branches to
   L(shl_0_gobble_mem_loop) when %rdx is at least half the data cache
   size; otherwise copies 128 bytes per pass in the loop below.
   In: %rsi/%rdi 16-byte aligned, %rdx = bytes left.  */
188 .p2align 4
189L(shl_0_gobble):
190#ifdef DATA_CACHE_SIZE_HALF
191 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
192#else
193 cmp __x86_data_cache_size_half(%rip), %RDX_LP
194#endif
    /* Bias the count by -128; lea keeps the flags of the cmp for jae.  */
195 lea -128(%rdx), %rdx
196 jae L(shl_0_gobble_mem_loop)
197L(shl_0_gobble_cache_loop):
198 movdqa (%rsi), %xmm4
199 movaps 0x10(%rsi), %xmm1
200 movaps 0x20(%rsi), %xmm2
201 movaps 0x30(%rsi), %xmm3
202
203 movdqa %xmm4, (%rdi)
204 movaps %xmm1, 0x10(%rdi)
205 movaps %xmm2, 0x20(%rdi)
206 movaps %xmm3, 0x30(%rdi)
207
    /* The carry of this sub is tested by the jae at the loop end; the
       moves and lea in between do not modify flags.  */
208 sub $128, %rdx
209 movaps 0x40(%rsi), %xmm4
210 movaps 0x50(%rsi), %xmm5
211 movaps 0x60(%rsi), %xmm6
212 movaps 0x70(%rsi), %xmm7
213 lea 0x80(%rsi), %rsi
214 movaps %xmm4, 0x40(%rdi)
215 movaps %xmm5, 0x50(%rdi)
216 movaps %xmm6, 0x60(%rdi)
217 movaps %xmm7, 0x70(%rdi)
218 lea 0x80(%rdi), %rdi
219
220 jae L(shl_0_gobble_cache_loop)
    /* %rdx is now (bytes left - 128).  Compare before removing the
       bias: jl is taken when fewer than 64 bytes remain.  */
221 cmp $-0x40, %rdx
222 lea 0x80(%rdx), %rdx
223 jl L(shl_0_cache_less_64bytes)
224
225 movdqa (%rsi), %xmm4
226 sub $0x40, %rdx
227 movdqa 0x10(%rsi), %xmm1
228
229 movdqa %xmm4, (%rdi)
230 movdqa %xmm1, 0x10(%rdi)
231
232 movdqa 0x20(%rsi), %xmm4
233 movdqa 0x30(%rsi), %xmm1
234 add $0x40, %rsi
235
236 movdqa %xmm4, 0x20(%rdi)
237 movdqa %xmm1, 0x30(%rdi)
238 add $0x40, %rdi
239L(shl_0_cache_less_64bytes):
    /* Move both pointers past the remaining bytes and dispatch on the
       remaining count (handlers are outside this excerpt).  */
240 add %rdx, %rsi
241 add %rdx, %rdi
242 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
243
/* Aligned forward copy, 128 bytes per pass with prefetcht0 of the
   source at +0x1c0 and +0x280.  Entered with %rdx already biased by
   -128; the loop is followed by optional 64-byte and 32-byte steps
   before the table dispatch for the rest.
   In: %rsi/%rdi 16-byte aligned.  Uses %xmm0-%xmm7.  */
244 .p2align 4
245L(shl_0_gobble_mem_loop):
246 prefetcht0 0x1c0(%rsi)
247 prefetcht0 0x280(%rsi)
248
249 movdqa (%rsi), %xmm0
250 movdqa 0x10(%rsi), %xmm1
251 movdqa 0x20(%rsi), %xmm2
252 movdqa 0x30(%rsi), %xmm3
253 movdqa 0x40(%rsi), %xmm4
254 movdqa 0x50(%rsi), %xmm5
255 movdqa 0x60(%rsi), %xmm6
256 movdqa 0x70(%rsi), %xmm7
257 lea 0x80(%rsi), %rsi
    /* The carry of this sub is tested by the jae at the loop end; the
       moves and lea in between do not modify flags.  */
258 sub $0x80, %rdx
259 movdqa %xmm0, (%rdi)
260 movdqa %xmm1, 0x10(%rdi)
261 movdqa %xmm2, 0x20(%rdi)
262 movdqa %xmm3, 0x30(%rdi)
263 movdqa %xmm4, 0x40(%rdi)
264 movdqa %xmm5, 0x50(%rdi)
265 movdqa %xmm6, 0x60(%rdi)
266 movdqa %xmm7, 0x70(%rdi)
267 lea 0x80(%rdi), %rdi
268
269 jae L(shl_0_gobble_mem_loop)
    /* %rdx is now (bytes left - 128).  Compare before removing the
       bias: jl is taken when fewer than 64 bytes remain.  */
270 cmp $-0x40, %rdx
271 lea 0x80(%rdx), %rdx
272 jl L(shl_0_mem_less_64bytes)
273
274 movdqa (%rsi), %xmm0
275 sub $0x40, %rdx
276 movdqa 0x10(%rsi), %xmm1
277
278 movdqa %xmm0, (%rdi)
279 movdqa %xmm1, 0x10(%rdi)
280
281 movdqa 0x20(%rsi), %xmm0
282 movdqa 0x30(%rsi), %xmm1
283 add $0x40, %rsi
284
285 movdqa %xmm0, 0x20(%rdi)
286 movdqa %xmm1, 0x30(%rdi)
287 add $0x40, %rdi
288L(shl_0_mem_less_64bytes):
    /* Copy one more 32-byte chunk when at least 32 bytes remain.  */
289 cmp $0x20, %rdx
290 jb L(shl_0_mem_less_32bytes)
291 movdqa (%rsi), %xmm0
292 sub $0x20, %rdx
293 movdqa 0x10(%rsi), %xmm1
294 add $0x20, %rsi
295 movdqa %xmm0, (%rdi)
296 movdqa %xmm1, 0x10(%rdi)
297 add $0x20, %rdi
298L(shl_0_mem_less_32bytes):
    /* Move both pointers past the remaining bytes and dispatch on the
       remaining count (handlers are outside this excerpt).  */
299 add %rdx, %rdi
300 add %rdx, %rsi
301 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
302
/* Backward copy with %rsi and %rdi both 16-byte aligned.  Both
   pointers address the end of the data still to be copied and move
   downwards.
   In: %rdx = bytes left, %xmm0 = saved 16-byte block that is stored
   unaligned to (%r8).  */
303 .p2align 4
304L(shl_0_bwd):
305 sub $16, %rdx
306 movdqa -0x10(%rsi), %xmm1
307 sub $16, %rsi
308 movdqa %xmm1, -0x10(%rdi)
309 sub $16, %rdi
310 cmp $0x80, %rdx
    /* The store does not touch flags; ja still tests the cmp above.  */
311 movdqu %xmm0, (%r8)
312 ja L(shl_0_gobble_bwd)
313 cmp $64, %rdx
314 jb L(shl_0_less_64bytes_bwd)
    /* Between 64 and 128 bytes left: copy one 64-byte chunk.  */
315 movaps -0x10(%rsi), %xmm0
316 movaps -0x20(%rsi), %xmm1
317 movaps -0x30(%rsi), %xmm2
318 movaps -0x40(%rsi), %xmm3
319 movaps %xmm0, -0x10(%rdi)
320 movaps %xmm1, -0x20(%rdi)
321 movaps %xmm2, -0x30(%rdi)
322 movaps %xmm3, -0x40(%rdi)
323 sub $64, %rdx
324 sub $0x40, %rsi
325 sub $0x40, %rdi
326L(shl_0_less_64bytes_bwd):
    /* The pointers already sit at the end of the remaining bytes, so
       dispatch on the remaining count without adjusting them.  */
327 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
328
/* Aligned backward copy of more than 128 bytes.  Branches to
   L(shl_0_gobble_mem_bwd_loop) when %rdx is at least half the data
   cache size; otherwise copies 128 bytes per pass in the loop below.
   In: %rsi/%rdi 16-byte aligned and pointing at the end of the data
   still to be copied, %rdx = bytes left.  */
329 .p2align 4
330L(shl_0_gobble_bwd):
331#ifdef DATA_CACHE_SIZE_HALF
332 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
333#else
334 cmp __x86_data_cache_size_half(%rip), %RDX_LP
335#endif
    /* Bias the count by -128; lea keeps the flags of the cmp for jae.  */
336 lea -128(%rdx), %rdx
337 jae L(shl_0_gobble_mem_bwd_loop)
338L(shl_0_gobble_bwd_loop):
339 movdqa -0x10(%rsi), %xmm0
340 movaps -0x20(%rsi), %xmm1
341 movaps -0x30(%rsi), %xmm2
342 movaps -0x40(%rsi), %xmm3
343
344 movdqa %xmm0, -0x10(%rdi)
345 movaps %xmm1, -0x20(%rdi)
346 movaps %xmm2, -0x30(%rdi)
347 movaps %xmm3, -0x40(%rdi)
348
    /* The carry of this sub is tested by the jae at the loop end; the
       moves and lea in between do not modify flags.  */
349 sub $0x80, %rdx
350 movaps -0x50(%rsi), %xmm4
351 movaps -0x60(%rsi), %xmm5
352 movaps -0x70(%rsi), %xmm6
353 movaps -0x80(%rsi), %xmm7
354 lea -0x80(%rsi), %rsi
355 movaps %xmm4, -0x50(%rdi)
356 movaps %xmm5, -0x60(%rdi)
357 movaps %xmm6, -0x70(%rdi)
358 movaps %xmm7, -0x80(%rdi)
359 lea -0x80(%rdi), %rdi
360
361 jae L(shl_0_gobble_bwd_loop)
    /* %rdx is now (bytes left - 128).  Compare before removing the
       bias: jl is taken when fewer than 64 bytes remain.  */
362 cmp $-0x40, %rdx
363 lea 0x80(%rdx), %rdx
364 jl L(shl_0_gobble_bwd_less_64bytes)
365
366 movdqa -0x10(%rsi), %xmm0
367 sub $0x40, %rdx
368 movdqa -0x20(%rsi), %xmm1
369
370 movdqa %xmm0, -0x10(%rdi)
371 movdqa %xmm1, -0x20(%rdi)
372
373 movdqa -0x30(%rsi), %xmm0
374 movdqa -0x40(%rsi), %xmm1
375 sub $0x40, %rsi
376
377 movdqa %xmm0, -0x30(%rdi)
378 movdqa %xmm1, -0x40(%rdi)
379 sub $0x40, %rdi
380L(shl_0_gobble_bwd_less_64bytes):
    /* Dispatch on the remaining count; pointers are left unchanged.  */
381 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
382
/* Aligned backward copy, 128 bytes per pass with prefetcht0 of the
   source at -0x1c0 and -0x280.  Entered with %rdx already biased by
   -128; the loop is followed by optional 64-byte and 32-byte steps
   before the table dispatch for the rest.
   In: %rsi/%rdi 16-byte aligned and pointing at the end of the data
   still to be copied.  Uses %xmm0-%xmm7.  */
383 .p2align 4
384L(shl_0_gobble_mem_bwd_loop):
385 prefetcht0 -0x1c0(%rsi)
386 prefetcht0 -0x280(%rsi)
387 movdqa -0x10(%rsi), %xmm0
388 movdqa -0x20(%rsi), %xmm1
389 movdqa -0x30(%rsi), %xmm2
390 movdqa -0x40(%rsi), %xmm3
391 movdqa -0x50(%rsi), %xmm4
392 movdqa -0x60(%rsi), %xmm5
393 movdqa -0x70(%rsi), %xmm6
394 movdqa -0x80(%rsi), %xmm7
395 lea -0x80(%rsi), %rsi
    /* The carry of this sub is tested by the jae at the loop end; the
       moves and lea in between do not modify flags.  */
396 sub $0x80, %rdx
397 movdqa %xmm0, -0x10(%rdi)
398 movdqa %xmm1, -0x20(%rdi)
399 movdqa %xmm2, -0x30(%rdi)
400 movdqa %xmm3, -0x40(%rdi)
401 movdqa %xmm4, -0x50(%rdi)
402 movdqa %xmm5, -0x60(%rdi)
403 movdqa %xmm6, -0x70(%rdi)
404 movdqa %xmm7, -0x80(%rdi)
405 lea -0x80(%rdi), %rdi
406
407 jae L(shl_0_gobble_mem_bwd_loop)
    /* %rdx is now (bytes left - 128).  Compare before removing the
       bias: jl is taken when fewer than 64 bytes remain.  */
408 cmp $-0x40, %rdx
409 lea 0x80(%rdx), %rdx
410 jl L(shl_0_mem_bwd_less_64bytes)
411
412 movdqa -0x10(%rsi), %xmm0
413 sub $0x40, %rdx
414 movdqa -0x20(%rsi), %xmm1
415
416 movdqa %xmm0, -0x10(%rdi)
417 movdqa %xmm1, -0x20(%rdi)
418
419 movdqa -0x30(%rsi), %xmm0
420 movdqa -0x40(%rsi), %xmm1
421 sub $0x40, %rsi
422
423 movdqa %xmm0, -0x30(%rdi)
424 movdqa %xmm1, -0x40(%rdi)
425 sub $0x40, %rdi
426L(shl_0_mem_bwd_less_64bytes):
    /* Copy one more 32-byte chunk when at least 32 bytes remain.  */
427 cmp $0x20, %rdx
428 jb L(shl_0_mem_bwd_less_32bytes)
429 movdqa -0x10(%rsi), %xmm0
430 sub $0x20, %rdx
431 movdqa -0x20(%rsi), %xmm1
432 sub $0x20, %rsi
433 movdqa %xmm0, -0x10(%rdi)
434 movdqa %xmm1, -0x20(%rdi)
435 sub $0x20, %rdi
436L(shl_0_mem_bwd_less_32bytes):
    /* Dispatch on the remaining count; pointers are left unchanged.  */
437 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
438
/* Forward copy for a source 1 byte past a 16-byte boundary (the
   aligned load from -0x01(%rsi) relies on that); %rdi is 16-byte
   aligned.  Each pass reads four aligned source blocks and uses
   palignr $1 on neighbouring blocks to form 64 destination bytes.
   In: %r9 must hold the address of L(shl_1) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
439 .p2align 4
440L(shl_1):
    /* %r9 = L(shl_1_loop_L1), the loop entry without prefetch.  */
441 lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
442 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
443 movaps -0x01(%rsi), %xmm1
444 jb L(L1_fwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
445 lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
446L(L1_fwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
447 lea -64(%rdx), %rdx
448 _CET_NOTRACK jmp *%r9
449 ud2
450L(shl_1_loop_L2):
451 prefetchnta 0x1c0(%rsi)
452L(shl_1_loop_L1):
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
453 sub $64, %rdx
454 movaps 0x0f(%rsi), %xmm2
455 movaps 0x1f(%rsi), %xmm3
456 movaps 0x2f(%rsi), %xmm4
457 movaps 0x3f(%rsi), %xmm5
    /* Keep the unshifted last block; it becomes %xmm1 for the next pass.  */
458 movdqa %xmm5, %xmm6
459 palignr $1, %xmm4, %xmm5
460 lea 64(%rsi), %rsi
461 palignr $1, %xmm3, %xmm4
462 palignr $1, %xmm2, %xmm3
463 lea 64(%rdi), %rdi
464 palignr $1, %xmm1, %xmm2
465 movdqa %xmm6, %xmm1
466 movdqa %xmm2, -0x40(%rdi)
467 movaps %xmm3, -0x30(%rdi)
468 jb L(shl_1_end)
469 movaps %xmm4, -0x20(%rdi)
470 movaps %xmm5, -0x10(%rdi)
471 _CET_NOTRACK jmp *%r9
472 ud2
473L(shl_1_end):
474 movaps %xmm4, -0x20(%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
475 lea 64(%rdx), %rdx
476 movaps %xmm5, -0x10(%rdi)
477 add %rdx, %rdi
478 movdqu %xmm0, (%r8)
479 add %rdx, %rsi
    /* Dispatch on the remaining count (handlers outside this excerpt).  */
480 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
481
/* Backward copy for a source end 1 byte past a 16-byte boundary (the
   aligned load from -0x01(%rsi) relies on that); %rdi is 16-byte
   aligned.  Both pointers move downwards, 64 bytes per pass, with
   palignr $1 combining neighbouring aligned source blocks.
   In: %r9 must hold the address of L(shl_1_bwd) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
482 .p2align 4
483L(shl_1_bwd):
    /* %r9 = L(shl_1_bwd_loop_L1), the loop entry without prefetch.  */
484 lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
485 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
486 movaps -0x01(%rsi), %xmm1
487 jb L(L1_bwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
488 lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
489L(L1_bwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
490 lea -64(%rdx), %rdx
491 _CET_NOTRACK jmp *%r9
492 ud2
493L(shl_1_bwd_loop_L2):
494 prefetchnta -0x1c0(%rsi)
495L(shl_1_bwd_loop_L1):
496 movaps -0x11(%rsi), %xmm2
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
497 sub $0x40, %rdx
498 movaps -0x21(%rsi), %xmm3
499 movaps -0x31(%rsi), %xmm4
500 movaps -0x41(%rsi), %xmm5
501 lea -0x40(%rsi), %rsi
502 palignr $1, %xmm2, %xmm1
503 palignr $1, %xmm3, %xmm2
504 palignr $1, %xmm4, %xmm3
505 palignr $1, %xmm5, %xmm4
506
507 movaps %xmm1, -0x10(%rdi)
    /* The lowest block loaded becomes %xmm1 for the next pass.  */
508 movaps %xmm5, %xmm1
509
510 movaps %xmm2, -0x20(%rdi)
511 lea -0x40(%rdi), %rdi
512
513 movaps %xmm3, 0x10(%rdi)
514 jb L(shl_1_bwd_end)
515 movaps %xmm4, (%rdi)
516 _CET_NOTRACK jmp *%r9
517 ud2
518L(shl_1_bwd_end):
519 movaps %xmm4, (%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
520 lea 64(%rdx), %rdx
521 movdqu %xmm0, (%r8)
    /* Dispatch on the remaining count; pointers are left unchanged.  */
522 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
523
/* Forward copy for a source 2 bytes past a 16-byte boundary (the
   aligned load from -0x02(%rsi) relies on that); %rdi is 16-byte
   aligned.  Each pass reads four aligned source blocks and uses
   palignr $2 on neighbouring blocks to form 64 destination bytes.
   In: %r9 must hold the address of L(shl_2) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
524 .p2align 4
525L(shl_2):
    /* %r9 = L(shl_2_loop_L1), the loop entry without prefetch.  */
526 lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
527 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
528 movaps -0x02(%rsi), %xmm1
529 jb L(L2_fwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
530 lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
531L(L2_fwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
532 lea -64(%rdx), %rdx
533 _CET_NOTRACK jmp *%r9
534 ud2
535L(shl_2_loop_L2):
536 prefetchnta 0x1c0(%rsi)
537L(shl_2_loop_L1):
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
538 sub $64, %rdx
539 movaps 0x0e(%rsi), %xmm2
540 movaps 0x1e(%rsi), %xmm3
541 movaps 0x2e(%rsi), %xmm4
542 movaps 0x3e(%rsi), %xmm5
    /* Keep the unshifted last block; it becomes %xmm1 for the next pass.  */
543 movdqa %xmm5, %xmm6
544 palignr $2, %xmm4, %xmm5
545 lea 64(%rsi), %rsi
546 palignr $2, %xmm3, %xmm4
547 palignr $2, %xmm2, %xmm3
548 lea 64(%rdi), %rdi
549 palignr $2, %xmm1, %xmm2
550 movdqa %xmm6, %xmm1
551 movdqa %xmm2, -0x40(%rdi)
552 movaps %xmm3, -0x30(%rdi)
553 jb L(shl_2_end)
554 movaps %xmm4, -0x20(%rdi)
555 movaps %xmm5, -0x10(%rdi)
556 _CET_NOTRACK jmp *%r9
557 ud2
558L(shl_2_end):
559 movaps %xmm4, -0x20(%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
560 lea 64(%rdx), %rdx
561 movaps %xmm5, -0x10(%rdi)
562 add %rdx, %rdi
563 movdqu %xmm0, (%r8)
564 add %rdx, %rsi
    /* Dispatch on the remaining count (handlers outside this excerpt).  */
565 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
566
/* Backward copy for a source end 2 bytes past a 16-byte boundary (the
   aligned load from -0x02(%rsi) relies on that); %rdi is 16-byte
   aligned.  Both pointers move downwards, 64 bytes per pass, with
   palignr $2 combining neighbouring aligned source blocks.
   In: %r9 must hold the address of L(shl_2_bwd) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
567 .p2align 4
568L(shl_2_bwd):
    /* %r9 = L(shl_2_bwd_loop_L1), the loop entry without prefetch.  */
569 lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
570 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
571 movaps -0x02(%rsi), %xmm1
572 jb L(L2_bwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
573 lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
574L(L2_bwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
575 lea -64(%rdx), %rdx
576 _CET_NOTRACK jmp *%r9
577 ud2
578L(shl_2_bwd_loop_L2):
579 prefetchnta -0x1c0(%rsi)
580L(shl_2_bwd_loop_L1):
581 movaps -0x12(%rsi), %xmm2
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
582 sub $0x40, %rdx
583 movaps -0x22(%rsi), %xmm3
584 movaps -0x32(%rsi), %xmm4
585 movaps -0x42(%rsi), %xmm5
586 lea -0x40(%rsi), %rsi
587 palignr $2, %xmm2, %xmm1
588 palignr $2, %xmm3, %xmm2
589 palignr $2, %xmm4, %xmm3
590 palignr $2, %xmm5, %xmm4
591
592 movaps %xmm1, -0x10(%rdi)
    /* The lowest block loaded becomes %xmm1 for the next pass.  */
593 movaps %xmm5, %xmm1
594
595 movaps %xmm2, -0x20(%rdi)
596 lea -0x40(%rdi), %rdi
597
598 movaps %xmm3, 0x10(%rdi)
599 jb L(shl_2_bwd_end)
600 movaps %xmm4, (%rdi)
601 _CET_NOTRACK jmp *%r9
602 ud2
603L(shl_2_bwd_end):
604 movaps %xmm4, (%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
605 lea 64(%rdx), %rdx
606 movdqu %xmm0, (%r8)
    /* Dispatch on the remaining count; pointers are left unchanged.  */
607 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
608
/* Forward copy for a source 3 bytes past a 16-byte boundary (the
   aligned load from -0x03(%rsi) relies on that); %rdi is 16-byte
   aligned.  Each pass reads four aligned source blocks and uses
   palignr $3 on neighbouring blocks to form 64 destination bytes.
   In: %r9 must hold the address of L(shl_3) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
609 .p2align 4
610L(shl_3):
    /* %r9 = L(shl_3_loop_L1), the loop entry without prefetch.  */
611 lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
612 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
613 movaps -0x03(%rsi), %xmm1
614 jb L(L3_fwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
615 lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
616L(L3_fwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
617 lea -64(%rdx), %rdx
618 _CET_NOTRACK jmp *%r9
619 ud2
620L(shl_3_loop_L2):
621 prefetchnta 0x1c0(%rsi)
622L(shl_3_loop_L1):
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
623 sub $64, %rdx
624 movaps 0x0d(%rsi), %xmm2
625 movaps 0x1d(%rsi), %xmm3
626 movaps 0x2d(%rsi), %xmm4
627 movaps 0x3d(%rsi), %xmm5
    /* Keep the unshifted last block; it becomes %xmm1 for the next pass.  */
628 movdqa %xmm5, %xmm6
629 palignr $3, %xmm4, %xmm5
630 lea 64(%rsi), %rsi
631 palignr $3, %xmm3, %xmm4
632 palignr $3, %xmm2, %xmm3
633 lea 64(%rdi), %rdi
634 palignr $3, %xmm1, %xmm2
635 movdqa %xmm6, %xmm1
636 movdqa %xmm2, -0x40(%rdi)
637 movaps %xmm3, -0x30(%rdi)
638 jb L(shl_3_end)
639 movaps %xmm4, -0x20(%rdi)
640 movaps %xmm5, -0x10(%rdi)
641 _CET_NOTRACK jmp *%r9
642 ud2
643L(shl_3_end):
644 movaps %xmm4, -0x20(%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
645 lea 64(%rdx), %rdx
646 movaps %xmm5, -0x10(%rdi)
647 add %rdx, %rdi
648 movdqu %xmm0, (%r8)
649 add %rdx, %rsi
    /* Dispatch on the remaining count (handlers outside this excerpt).  */
650 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
651
/* Backward copy for a source end 3 bytes past a 16-byte boundary (the
   aligned load from -0x03(%rsi) relies on that); %rdi is 16-byte
   aligned.  Both pointers move downwards, 64 bytes per pass, with
   palignr $3 combining neighbouring aligned source blocks.
   In: %r9 must hold the address of L(shl_3_bwd) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
652 .p2align 4
653L(shl_3_bwd):
    /* %r9 = L(shl_3_bwd_loop_L1), the loop entry without prefetch.  */
654 lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
655 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
656 movaps -0x03(%rsi), %xmm1
657 jb L(L3_bwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
658 lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
659L(L3_bwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
660 lea -64(%rdx), %rdx
661 _CET_NOTRACK jmp *%r9
662 ud2
663L(shl_3_bwd_loop_L2):
664 prefetchnta -0x1c0(%rsi)
665L(shl_3_bwd_loop_L1):
666 movaps -0x13(%rsi), %xmm2
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
667 sub $0x40, %rdx
668 movaps -0x23(%rsi), %xmm3
669 movaps -0x33(%rsi), %xmm4
670 movaps -0x43(%rsi), %xmm5
671 lea -0x40(%rsi), %rsi
672 palignr $3, %xmm2, %xmm1
673 palignr $3, %xmm3, %xmm2
674 palignr $3, %xmm4, %xmm3
675 palignr $3, %xmm5, %xmm4
676
677 movaps %xmm1, -0x10(%rdi)
    /* The lowest block loaded becomes %xmm1 for the next pass.  */
678 movaps %xmm5, %xmm1
679
680 movaps %xmm2, -0x20(%rdi)
681 lea -0x40(%rdi), %rdi
682
683 movaps %xmm3, 0x10(%rdi)
684 jb L(shl_3_bwd_end)
685 movaps %xmm4, (%rdi)
686 _CET_NOTRACK jmp *%r9
687 ud2
688L(shl_3_bwd_end):
689 movaps %xmm4, (%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
690 lea 64(%rdx), %rdx
691 movdqu %xmm0, (%r8)
    /* Dispatch on the remaining count; pointers are left unchanged.  */
692 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
693
/* Forward copy for a source 4 bytes past a 16-byte boundary (the
   aligned load from -0x04(%rsi) relies on that); %rdi is 16-byte
   aligned.  Each pass reads four aligned source blocks and uses
   palignr $4 on neighbouring blocks to form 64 destination bytes.
   In: %r9 must hold the address of L(shl_4) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
694 .p2align 4
695L(shl_4):
    /* %r9 = L(shl_4_loop_L1), the loop entry without prefetch.  */
696 lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
697 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
698 movaps -0x04(%rsi), %xmm1
699 jb L(L4_fwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
700 lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
701L(L4_fwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
702 lea -64(%rdx), %rdx
703 _CET_NOTRACK jmp *%r9
704 ud2
705L(shl_4_loop_L2):
706 prefetchnta 0x1c0(%rsi)
707L(shl_4_loop_L1):
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
708 sub $64, %rdx
709 movaps 0x0c(%rsi), %xmm2
710 movaps 0x1c(%rsi), %xmm3
711 movaps 0x2c(%rsi), %xmm4
712 movaps 0x3c(%rsi), %xmm5
    /* Keep the unshifted last block; it becomes %xmm1 for the next pass.  */
713 movdqa %xmm5, %xmm6
714 palignr $4, %xmm4, %xmm5
715 lea 64(%rsi), %rsi
716 palignr $4, %xmm3, %xmm4
717 palignr $4, %xmm2, %xmm3
718 lea 64(%rdi), %rdi
719 palignr $4, %xmm1, %xmm2
720 movdqa %xmm6, %xmm1
721 movdqa %xmm2, -0x40(%rdi)
722 movaps %xmm3, -0x30(%rdi)
723 jb L(shl_4_end)
724 movaps %xmm4, -0x20(%rdi)
725 movaps %xmm5, -0x10(%rdi)
726 _CET_NOTRACK jmp *%r9
727 ud2
728L(shl_4_end):
729 movaps %xmm4, -0x20(%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
730 lea 64(%rdx), %rdx
731 movaps %xmm5, -0x10(%rdi)
732 add %rdx, %rdi
733 movdqu %xmm0, (%r8)
734 add %rdx, %rsi
    /* Dispatch on the remaining count (handlers outside this excerpt).  */
735 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
736
/* Backward copy for a source end 4 bytes past a 16-byte boundary (the
   aligned load from -0x04(%rsi) relies on that); %rdi is 16-byte
   aligned.  Both pointers move downwards, 64 bytes per pass, with
   palignr $4 combining neighbouring aligned source blocks.
   In: %r9 must hold the address of L(shl_4_bwd) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
737 .p2align 4
738L(shl_4_bwd):
    /* %r9 = L(shl_4_bwd_loop_L1), the loop entry without prefetch.  */
739 lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
740 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
741 movaps -0x04(%rsi), %xmm1
742 jb L(L4_bwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
743 lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
744L(L4_bwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
745 lea -64(%rdx), %rdx
746 _CET_NOTRACK jmp *%r9
747 ud2
748L(shl_4_bwd_loop_L2):
749 prefetchnta -0x1c0(%rsi)
750L(shl_4_bwd_loop_L1):
751 movaps -0x14(%rsi), %xmm2
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
752 sub $0x40, %rdx
753 movaps -0x24(%rsi), %xmm3
754 movaps -0x34(%rsi), %xmm4
755 movaps -0x44(%rsi), %xmm5
756 lea -0x40(%rsi), %rsi
757 palignr $4, %xmm2, %xmm1
758 palignr $4, %xmm3, %xmm2
759 palignr $4, %xmm4, %xmm3
760 palignr $4, %xmm5, %xmm4
761
762 movaps %xmm1, -0x10(%rdi)
    /* The lowest block loaded becomes %xmm1 for the next pass.  */
763 movaps %xmm5, %xmm1
764
765 movaps %xmm2, -0x20(%rdi)
766 lea -0x40(%rdi), %rdi
767
768 movaps %xmm3, 0x10(%rdi)
769 jb L(shl_4_bwd_end)
770 movaps %xmm4, (%rdi)
771 _CET_NOTRACK jmp *%r9
772 ud2
773L(shl_4_bwd_end):
774 movaps %xmm4, (%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
775 lea 64(%rdx), %rdx
776 movdqu %xmm0, (%r8)
    /* Dispatch on the remaining count; pointers are left unchanged.  */
777 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
778
/* Forward copy for a source 5 bytes past a 16-byte boundary (the
   aligned load from -0x05(%rsi) relies on that); %rdi is 16-byte
   aligned.  Each pass reads four aligned source blocks and uses
   palignr $5 on neighbouring blocks to form 64 destination bytes.
   In: %r9 must hold the address of L(shl_5) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
779 .p2align 4
780L(shl_5):
    /* %r9 = L(shl_5_loop_L1), the loop entry without prefetch.  */
781 lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
782 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
783 movaps -0x05(%rsi), %xmm1
784 jb L(L5_fwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
785 lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
786L(L5_fwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
787 lea -64(%rdx), %rdx
788 _CET_NOTRACK jmp *%r9
789 ud2
790L(shl_5_loop_L2):
791 prefetchnta 0x1c0(%rsi)
792L(shl_5_loop_L1):
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
793 sub $64, %rdx
794 movaps 0x0b(%rsi), %xmm2
795 movaps 0x1b(%rsi), %xmm3
796 movaps 0x2b(%rsi), %xmm4
797 movaps 0x3b(%rsi), %xmm5
    /* Keep the unshifted last block; it becomes %xmm1 for the next pass.  */
798 movdqa %xmm5, %xmm6
799 palignr $5, %xmm4, %xmm5
800 lea 64(%rsi), %rsi
801 palignr $5, %xmm3, %xmm4
802 palignr $5, %xmm2, %xmm3
803 lea 64(%rdi), %rdi
804 palignr $5, %xmm1, %xmm2
805 movdqa %xmm6, %xmm1
806 movdqa %xmm2, -0x40(%rdi)
807 movaps %xmm3, -0x30(%rdi)
808 jb L(shl_5_end)
809 movaps %xmm4, -0x20(%rdi)
810 movaps %xmm5, -0x10(%rdi)
811 _CET_NOTRACK jmp *%r9
812 ud2
813L(shl_5_end):
814 movaps %xmm4, -0x20(%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
815 lea 64(%rdx), %rdx
816 movaps %xmm5, -0x10(%rdi)
817 add %rdx, %rdi
818 movdqu %xmm0, (%r8)
819 add %rdx, %rsi
    /* Dispatch on the remaining count (handlers outside this excerpt).  */
820 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
821
/* Backward copy for a source end 5 bytes past a 16-byte boundary (the
   aligned load from -0x05(%rsi) relies on that); %rdi is 16-byte
   aligned.  Both pointers move downwards, 64 bytes per pass, with
   palignr $5 combining neighbouring aligned source blocks.
   In: %r9 must hold the address of L(shl_5_bwd) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
822 .p2align 4
823L(shl_5_bwd):
    /* %r9 = L(shl_5_bwd_loop_L1), the loop entry without prefetch.  */
824 lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
825 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
826 movaps -0x05(%rsi), %xmm1
827 jb L(L5_bwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
828 lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
829L(L5_bwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
830 lea -64(%rdx), %rdx
831 _CET_NOTRACK jmp *%r9
832 ud2
833L(shl_5_bwd_loop_L2):
834 prefetchnta -0x1c0(%rsi)
835L(shl_5_bwd_loop_L1):
836 movaps -0x15(%rsi), %xmm2
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
837 sub $0x40, %rdx
838 movaps -0x25(%rsi), %xmm3
839 movaps -0x35(%rsi), %xmm4
840 movaps -0x45(%rsi), %xmm5
841 lea -0x40(%rsi), %rsi
842 palignr $5, %xmm2, %xmm1
843 palignr $5, %xmm3, %xmm2
844 palignr $5, %xmm4, %xmm3
845 palignr $5, %xmm5, %xmm4
846
847 movaps %xmm1, -0x10(%rdi)
    /* The lowest block loaded becomes %xmm1 for the next pass.  */
848 movaps %xmm5, %xmm1
849
850 movaps %xmm2, -0x20(%rdi)
851 lea -0x40(%rdi), %rdi
852
853 movaps %xmm3, 0x10(%rdi)
854 jb L(shl_5_bwd_end)
855 movaps %xmm4, (%rdi)
856 _CET_NOTRACK jmp *%r9
857 ud2
858L(shl_5_bwd_end):
859 movaps %xmm4, (%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
860 lea 64(%rdx), %rdx
861 movdqu %xmm0, (%r8)
    /* Dispatch on the remaining count; pointers are left unchanged.  */
862 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
863
/* Forward copy for a source 6 bytes past a 16-byte boundary (the
   aligned load from -0x06(%rsi) relies on that); %rdi is 16-byte
   aligned.  Each pass reads four aligned source blocks and uses
   palignr $6 on neighbouring blocks to form 64 destination bytes.
   In: %r9 must hold the address of L(shl_6) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
864 .p2align 4
865L(shl_6):
    /* %r9 = L(shl_6_loop_L1), the loop entry without prefetch.  */
866 lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
867 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
868 movaps -0x06(%rsi), %xmm1
869 jb L(L6_fwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
870 lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
871L(L6_fwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
872 lea -64(%rdx), %rdx
873 _CET_NOTRACK jmp *%r9
874 ud2
875L(shl_6_loop_L2):
876 prefetchnta 0x1c0(%rsi)
877L(shl_6_loop_L1):
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
878 sub $64, %rdx
879 movaps 0x0a(%rsi), %xmm2
880 movaps 0x1a(%rsi), %xmm3
881 movaps 0x2a(%rsi), %xmm4
882 movaps 0x3a(%rsi), %xmm5
    /* Keep the unshifted last block; it becomes %xmm1 for the next pass.  */
883 movdqa %xmm5, %xmm6
884 palignr $6, %xmm4, %xmm5
885 lea 64(%rsi), %rsi
886 palignr $6, %xmm3, %xmm4
887 palignr $6, %xmm2, %xmm3
888 lea 64(%rdi), %rdi
889 palignr $6, %xmm1, %xmm2
890 movdqa %xmm6, %xmm1
891 movdqa %xmm2, -0x40(%rdi)
892 movaps %xmm3, -0x30(%rdi)
893 jb L(shl_6_end)
894 movaps %xmm4, -0x20(%rdi)
895 movaps %xmm5, -0x10(%rdi)
896 _CET_NOTRACK jmp *%r9
897 ud2
898L(shl_6_end):
899 movaps %xmm4, -0x20(%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
900 lea 64(%rdx), %rdx
901 movaps %xmm5, -0x10(%rdi)
902 add %rdx, %rdi
903 movdqu %xmm0, (%r8)
904 add %rdx, %rsi
    /* Dispatch on the remaining count (handlers outside this excerpt).  */
905 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
906
/* Backward copy for a source end 6 bytes past a 16-byte boundary (the
   aligned load from -0x06(%rsi) relies on that); %rdi is 16-byte
   aligned.  Both pointers move downwards, 64 bytes per pass, with
   palignr $6 combining neighbouring aligned source blocks.
   In: %r9 must hold the address of L(shl_6_bwd) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
907 .p2align 4
908L(shl_6_bwd):
    /* %r9 = L(shl_6_bwd_loop_L1), the loop entry without prefetch.  */
909 lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
910 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
911 movaps -0x06(%rsi), %xmm1
912 jb L(L6_bwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
913 lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
914L(L6_bwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
915 lea -64(%rdx), %rdx
916 _CET_NOTRACK jmp *%r9
917 ud2
918L(shl_6_bwd_loop_L2):
919 prefetchnta -0x1c0(%rsi)
920L(shl_6_bwd_loop_L1):
921 movaps -0x16(%rsi), %xmm2
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
922 sub $0x40, %rdx
923 movaps -0x26(%rsi), %xmm3
924 movaps -0x36(%rsi), %xmm4
925 movaps -0x46(%rsi), %xmm5
926 lea -0x40(%rsi), %rsi
927 palignr $6, %xmm2, %xmm1
928 palignr $6, %xmm3, %xmm2
929 palignr $6, %xmm4, %xmm3
930 palignr $6, %xmm5, %xmm4
931
932 movaps %xmm1, -0x10(%rdi)
    /* The lowest block loaded becomes %xmm1 for the next pass.  */
933 movaps %xmm5, %xmm1
934
935 movaps %xmm2, -0x20(%rdi)
936 lea -0x40(%rdi), %rdi
937
938 movaps %xmm3, 0x10(%rdi)
939 jb L(shl_6_bwd_end)
940 movaps %xmm4, (%rdi)
941 _CET_NOTRACK jmp *%r9
942 ud2
943L(shl_6_bwd_end):
944 movaps %xmm4, (%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
945 lea 64(%rdx), %rdx
946 movdqu %xmm0, (%r8)
    /* Dispatch on the remaining count; pointers are left unchanged.  */
947 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
948
/* Forward copy for a source 7 bytes past a 16-byte boundary (the
   aligned load from -0x07(%rsi) relies on that); %rdi is 16-byte
   aligned.  Each pass reads four aligned source blocks and uses
   palignr $7 on neighbouring blocks to form 64 destination bytes.
   In: %r9 must hold the address of L(shl_7) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
949 .p2align 4
950L(shl_7):
    /* %r9 = L(shl_7_loop_L1), the loop entry without prefetch.  */
951 lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
952 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
953 movaps -0x07(%rsi), %xmm1
954 jb L(L7_fwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
955 lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
956L(L7_fwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
957 lea -64(%rdx), %rdx
958 _CET_NOTRACK jmp *%r9
959 ud2
960L(shl_7_loop_L2):
961 prefetchnta 0x1c0(%rsi)
962L(shl_7_loop_L1):
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
963 sub $64, %rdx
964 movaps 0x09(%rsi), %xmm2
965 movaps 0x19(%rsi), %xmm3
966 movaps 0x29(%rsi), %xmm4
967 movaps 0x39(%rsi), %xmm5
    /* Keep the unshifted last block; it becomes %xmm1 for the next pass.  */
968 movdqa %xmm5, %xmm6
969 palignr $7, %xmm4, %xmm5
970 lea 64(%rsi), %rsi
971 palignr $7, %xmm3, %xmm4
972 palignr $7, %xmm2, %xmm3
973 lea 64(%rdi), %rdi
974 palignr $7, %xmm1, %xmm2
975 movdqa %xmm6, %xmm1
976 movdqa %xmm2, -0x40(%rdi)
977 movaps %xmm3, -0x30(%rdi)
978 jb L(shl_7_end)
979 movaps %xmm4, -0x20(%rdi)
980 movaps %xmm5, -0x10(%rdi)
981 _CET_NOTRACK jmp *%r9
982 ud2
983L(shl_7_end):
984 movaps %xmm4, -0x20(%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
985 lea 64(%rdx), %rdx
986 movaps %xmm5, -0x10(%rdi)
987 add %rdx, %rdi
988 movdqu %xmm0, (%r8)
989 add %rdx, %rsi
    /* Dispatch on the remaining count (handlers outside this excerpt).  */
990 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
991
/* Backward copy for a source end 7 bytes past a 16-byte boundary (the
   aligned load from -0x07(%rsi) relies on that); %rdi is 16-byte
   aligned.  Both pointers move downwards, 64 bytes per pass, with
   palignr $7 combining neighbouring aligned source blocks.
   In: %r9 must hold the address of L(shl_7_bwd) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
992 .p2align 4
993L(shl_7_bwd):
    /* %r9 = L(shl_7_bwd_loop_L1), the loop entry without prefetch.  */
994 lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
995 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
996 movaps -0x07(%rsi), %xmm1
997 jb L(L7_bwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
998 lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
999L(L7_bwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
1000 lea -64(%rdx), %rdx
1001 _CET_NOTRACK jmp *%r9
1002 ud2
1003L(shl_7_bwd_loop_L2):
1004 prefetchnta -0x1c0(%rsi)
1005L(shl_7_bwd_loop_L1):
1006 movaps -0x17(%rsi), %xmm2
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
1007 sub $0x40, %rdx
1008 movaps -0x27(%rsi), %xmm3
1009 movaps -0x37(%rsi), %xmm4
1010 movaps -0x47(%rsi), %xmm5
1011 lea -0x40(%rsi), %rsi
1012 palignr $7, %xmm2, %xmm1
1013 palignr $7, %xmm3, %xmm2
1014 palignr $7, %xmm4, %xmm3
1015 palignr $7, %xmm5, %xmm4
1016
1017 movaps %xmm1, -0x10(%rdi)
    /* The lowest block loaded becomes %xmm1 for the next pass.  */
1018 movaps %xmm5, %xmm1
1019
1020 movaps %xmm2, -0x20(%rdi)
1021 lea -0x40(%rdi), %rdi
1022
1023 movaps %xmm3, 0x10(%rdi)
1024 jb L(shl_7_bwd_end)
1025 movaps %xmm4, (%rdi)
1026 _CET_NOTRACK jmp *%r9
1027 ud2
1028L(shl_7_bwd_end):
1029 movaps %xmm4, (%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
1030 lea 64(%rdx), %rdx
1031 movdqu %xmm0, (%r8)
    /* Dispatch on the remaining count; pointers are left unchanged.  */
1032 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1033
/* Forward copy for a source 8 bytes past a 16-byte boundary (the
   aligned load from -0x08(%rsi) relies on that); %rdi is 16-byte
   aligned.  Each pass reads four aligned source blocks and uses
   palignr $8 on neighbouring blocks to form 64 destination bytes.
   In: %r9 must hold the address of L(shl_8) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
1034 .p2align 4
1035L(shl_8):
    /* %r9 = L(shl_8_loop_L1), the loop entry without prefetch.  */
1036 lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
1037 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
1038 movaps -0x08(%rsi), %xmm1
1039 jb L(L8_fwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
1040 lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
1041L(L8_fwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
1042 lea -64(%rdx), %rdx
1043 _CET_NOTRACK jmp *%r9
    /* Follow the indirect jump with ud2, as every other indirect jump
       in this file does.  */
    ud2
1044L(shl_8_loop_L2):
1045 prefetchnta 0x1c0(%rsi)
1046L(shl_8_loop_L1):
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
1047 sub $64, %rdx
1048 movaps 0x08(%rsi), %xmm2
1049 movaps 0x18(%rsi), %xmm3
1050 movaps 0x28(%rsi), %xmm4
1051 movaps 0x38(%rsi), %xmm5
    /* Keep the unshifted last block; it becomes %xmm1 for the next pass.  */
1052 movdqa %xmm5, %xmm6
1053 palignr $8, %xmm4, %xmm5
1054 lea 64(%rsi), %rsi
1055 palignr $8, %xmm3, %xmm4
1056 palignr $8, %xmm2, %xmm3
1057 lea 64(%rdi), %rdi
1058 palignr $8, %xmm1, %xmm2
1059 movdqa %xmm6, %xmm1
1060 movdqa %xmm2, -0x40(%rdi)
1061 movaps %xmm3, -0x30(%rdi)
1062 jb L(shl_8_end)
1063 movaps %xmm4, -0x20(%rdi)
1064 movaps %xmm5, -0x10(%rdi)
1065 _CET_NOTRACK jmp *%r9
1066 ud2
1067 .p2align 4
1068L(shl_8_end):
    /* Remove the bias: %rdx = bytes still to copy.  */
1069 lea 64(%rdx), %rdx
1070 movaps %xmm4, -0x20(%rdi)
1071 add %rdx, %rsi
1072 movaps %xmm5, -0x10(%rdi)
1073 add %rdx, %rdi
1074 movdqu %xmm0, (%r8)
    /* Dispatch on the remaining count (handlers outside this excerpt).  */
1075 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1076
/* Backward copy for a source end 8 bytes past a 16-byte boundary (the
   aligned load from -0x08(%rsi) relies on that); %rdi is 16-byte
   aligned.  Both pointers move downwards, 64 bytes per pass, with
   palignr $8 combining neighbouring aligned source blocks.
   In: %r9 must hold the address of L(shl_8_bwd) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
1077 .p2align 4
1078L(shl_8_bwd):
    /* %r9 = L(shl_8_bwd_loop_L1), the loop entry without prefetch.  */
1079 lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
1080 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
1081 movaps -0x08(%rsi), %xmm1
1082 jb L(L8_bwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
1083 lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
1084L(L8_bwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
1085 lea -64(%rdx), %rdx
1086 _CET_NOTRACK jmp *%r9
1087 ud2
1088L(shl_8_bwd_loop_L2):
1089 prefetchnta -0x1c0(%rsi)
1090L(shl_8_bwd_loop_L1):
1091 movaps -0x18(%rsi), %xmm2
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
1092 sub $0x40, %rdx
1093 movaps -0x28(%rsi), %xmm3
1094 movaps -0x38(%rsi), %xmm4
1095 movaps -0x48(%rsi), %xmm5
1096 lea -0x40(%rsi), %rsi
1097 palignr $8, %xmm2, %xmm1
1098 palignr $8, %xmm3, %xmm2
1099 palignr $8, %xmm4, %xmm3
1100 palignr $8, %xmm5, %xmm4
1101
1102 movaps %xmm1, -0x10(%rdi)
    /* The lowest block loaded becomes %xmm1 for the next pass.  */
1103 movaps %xmm5, %xmm1
1104
1105 movaps %xmm2, -0x20(%rdi)
1106 lea -0x40(%rdi), %rdi
1107
1108 movaps %xmm3, 0x10(%rdi)
1109 jb L(shl_8_bwd_end)
1110 movaps %xmm4, (%rdi)
1111 _CET_NOTRACK jmp *%r9
1112 ud2
1113L(shl_8_bwd_end):
1114 movaps %xmm4, (%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
1115 lea 64(%rdx), %rdx
1116 movdqu %xmm0, (%r8)
    /* Dispatch on the remaining count; pointers are left unchanged.  */
1117 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1118
/* Forward copy for a source 9 bytes past a 16-byte boundary (the
   aligned load from -0x09(%rsi) relies on that); %rdi is 16-byte
   aligned.  Each pass reads four aligned source blocks and uses
   palignr $9 on neighbouring blocks to form 64 destination bytes.
   In: %r9 must hold the address of L(shl_9) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
1119 .p2align 4
1120L(shl_9):
    /* %r9 = L(shl_9_loop_L1), the loop entry without prefetch.  */
1121 lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
1122 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
1123 movaps -0x09(%rsi), %xmm1
1124 jb L(L9_fwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
1125 lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
1126L(L9_fwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
1127 lea -64(%rdx), %rdx
1128 _CET_NOTRACK jmp *%r9
1129 ud2
1130L(shl_9_loop_L2):
1131 prefetchnta 0x1c0(%rsi)
1132L(shl_9_loop_L1):
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
1133 sub $64, %rdx
1134 movaps 0x07(%rsi), %xmm2
1135 movaps 0x17(%rsi), %xmm3
1136 movaps 0x27(%rsi), %xmm4
1137 movaps 0x37(%rsi), %xmm5
    /* Keep the unshifted last block; it becomes %xmm1 for the next pass.  */
1138 movdqa %xmm5, %xmm6
1139 palignr $9, %xmm4, %xmm5
1140 lea 64(%rsi), %rsi
1141 palignr $9, %xmm3, %xmm4
1142 palignr $9, %xmm2, %xmm3
1143 lea 64(%rdi), %rdi
1144 palignr $9, %xmm1, %xmm2
1145 movdqa %xmm6, %xmm1
1146 movdqa %xmm2, -0x40(%rdi)
1147 movaps %xmm3, -0x30(%rdi)
1148 jb L(shl_9_end)
1149 movaps %xmm4, -0x20(%rdi)
1150 movaps %xmm5, -0x10(%rdi)
1151 _CET_NOTRACK jmp *%r9
1152 ud2
1153L(shl_9_end):
1154 movaps %xmm4, -0x20(%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
1155 lea 64(%rdx), %rdx
1156 movaps %xmm5, -0x10(%rdi)
1157 add %rdx, %rdi
1158 movdqu %xmm0, (%r8)
1159 add %rdx, %rsi
    /* Dispatch on the remaining count (handlers outside this excerpt).  */
1160 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1161
/* Backward copy for a source end 9 bytes past a 16-byte boundary (the
   aligned load from -0x09(%rsi) relies on that); %rdi is 16-byte
   aligned.  Both pointers move downwards, 64 bytes per pass, with
   palignr $9 combining neighbouring aligned source blocks.
   In: %r9 must hold the address of L(shl_9_bwd) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
1162 .p2align 4
1163L(shl_9_bwd):
    /* %r9 = L(shl_9_bwd_loop_L1), the loop entry without prefetch.  */
1164 lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
1165 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
1166 movaps -0x09(%rsi), %xmm1
1167 jb L(L9_bwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
1168 lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
1169L(L9_bwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
1170 lea -64(%rdx), %rdx
1171 _CET_NOTRACK jmp *%r9
1172 ud2
1173L(shl_9_bwd_loop_L2):
1174 prefetchnta -0x1c0(%rsi)
1175L(shl_9_bwd_loop_L1):
1176 movaps -0x19(%rsi), %xmm2
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
1177 sub $0x40, %rdx
1178 movaps -0x29(%rsi), %xmm3
1179 movaps -0x39(%rsi), %xmm4
1180 movaps -0x49(%rsi), %xmm5
1181 lea -0x40(%rsi), %rsi
1182 palignr $9, %xmm2, %xmm1
1183 palignr $9, %xmm3, %xmm2
1184 palignr $9, %xmm4, %xmm3
1185 palignr $9, %xmm5, %xmm4
1186
1187 movaps %xmm1, -0x10(%rdi)
    /* The lowest block loaded becomes %xmm1 for the next pass.  */
1188 movaps %xmm5, %xmm1
1189
1190 movaps %xmm2, -0x20(%rdi)
1191 lea -0x40(%rdi), %rdi
1192
1193 movaps %xmm3, 0x10(%rdi)
1194 jb L(shl_9_bwd_end)
1195 movaps %xmm4, (%rdi)
1196 _CET_NOTRACK jmp *%r9
1197 ud2
1198L(shl_9_bwd_end):
1199 movaps %xmm4, (%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
1200 lea 64(%rdx), %rdx
1201 movdqu %xmm0, (%r8)
    /* Dispatch on the remaining count; pointers are left unchanged.  */
1202 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1203
/* Forward copy for a source 10 bytes past a 16-byte boundary (the
   aligned load from -0x0a(%rsi) relies on that); %rdi is 16-byte
   aligned.  Each pass reads four aligned source blocks and uses
   palignr $10 on neighbouring blocks to form 64 destination bytes.
   In: %r9 must hold the address of L(shl_10) (BRANCH_TO_JMPTBL_ENTRY
   leaves its jump target in the index register), %rdx = bytes left,
   %rcx = size threshold, %xmm0 = saved block stored to (%r8) on exit.  */
1204 .p2align 4
1205L(shl_10):
    /* %r9 = L(shl_10_loop_L1), the loop entry without prefetch.  */
1206 lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
1207 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
1208 movaps -0x0a(%rsi), %xmm1
1209 jb L(L10_fwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
1210 lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
1211L(L10_fwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
1212 lea -64(%rdx), %rdx
1213 _CET_NOTRACK jmp *%r9
1214 ud2
1215L(shl_10_loop_L2):
1216 prefetchnta 0x1c0(%rsi)
1217L(shl_10_loop_L1):
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
1218 sub $64, %rdx
1219 movaps 0x06(%rsi), %xmm2
1220 movaps 0x16(%rsi), %xmm3
1221 movaps 0x26(%rsi), %xmm4
1222 movaps 0x36(%rsi), %xmm5
    /* Keep the unshifted last block; it becomes %xmm1 for the next pass.  */
1223 movdqa %xmm5, %xmm6
1224 palignr $10, %xmm4, %xmm5
1225 lea 64(%rsi), %rsi
1226 palignr $10, %xmm3, %xmm4
1227 palignr $10, %xmm2, %xmm3
1228 lea 64(%rdi), %rdi
1229 palignr $10, %xmm1, %xmm2
1230 movdqa %xmm6, %xmm1
1231 movdqa %xmm2, -0x40(%rdi)
1232 movaps %xmm3, -0x30(%rdi)
1233 jb L(shl_10_end)
1234 movaps %xmm4, -0x20(%rdi)
1235 movaps %xmm5, -0x10(%rdi)
1236 _CET_NOTRACK jmp *%r9
1237 ud2
1238L(shl_10_end):
1239 movaps %xmm4, -0x20(%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
1240 lea 64(%rdx), %rdx
1241 movaps %xmm5, -0x10(%rdi)
1242 add %rdx, %rdi
1243 movdqu %xmm0, (%r8)
1244 add %rdx, %rsi
    /* Dispatch on the remaining count (handlers outside this excerpt).  */
1245 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1246
/* Backward copy for a source end 10 bytes past a 16-byte boundary (the
   aligned load from -0x0a(%rsi) relies on that); %rdi is 16-byte
   aligned.  Both pointers move downwards, 64 bytes per pass, with
   palignr $10 combining neighbouring aligned source blocks.
   In: %r9 must hold the address of L(shl_10_bwd)
   (BRANCH_TO_JMPTBL_ENTRY leaves its jump target in the index
   register), %rdx = bytes left, %rcx = size threshold, %xmm0 = saved
   block stored to (%r8) on exit.  */
1247 .p2align 4
1248L(shl_10_bwd):
    /* %r9 = L(shl_10_bwd_loop_L1), the loop entry without prefetch.  */
1249 lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
1250 cmp %rcx, %rdx
    /* %xmm1 carries the previous aligned source block between passes.  */
1251 movaps -0x0a(%rsi), %xmm1
1252 jb L(L10_bwd)
    /* %rdx >= %rcx: use the loop entry that prefetches instead.  */
1253 lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
1254L(L10_bwd):
    /* Bias the count by -64 so the borrow of the sub in the loop
       marks the last pass.  */
1255 lea -64(%rdx), %rdx
1256 _CET_NOTRACK jmp *%r9
1257 ud2
1258L(shl_10_bwd_loop_L2):
1259 prefetchnta -0x1c0(%rsi)
1260L(shl_10_bwd_loop_L1):
1261 movaps -0x1a(%rsi), %xmm2
    /* The carry set here is tested by the jb below; the SSE moves,
       palignr and lea in between do not modify flags.  */
1262 sub $0x40, %rdx
1263 movaps -0x2a(%rsi), %xmm3
1264 movaps -0x3a(%rsi), %xmm4
1265 movaps -0x4a(%rsi), %xmm5
1266 lea -0x40(%rsi), %rsi
1267 palignr $10, %xmm2, %xmm1
1268 palignr $10, %xmm3, %xmm2
1269 palignr $10, %xmm4, %xmm3
1270 palignr $10, %xmm5, %xmm4
1271
1272 movaps %xmm1, -0x10(%rdi)
    /* The lowest block loaded becomes %xmm1 for the next pass.  */
1273 movaps %xmm5, %xmm1
1274
1275 movaps %xmm2, -0x20(%rdi)
1276 lea -0x40(%rdi), %rdi
1277
1278 movaps %xmm3, 0x10(%rdi)
1279 jb L(shl_10_bwd_end)
1280 movaps %xmm4, (%rdi)
1281 _CET_NOTRACK jmp *%r9
1282 ud2
1283L(shl_10_bwd_end):
1284 movaps %xmm4, (%rdi)
    /* Remove the bias: %rdx = bytes still to copy.  */
1285 lea 64(%rdx), %rdx
1286 movdqu %xmm0, (%r8)
    /* Dispatch on the remaining count; pointers are left unchanged.  */
1287 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1288
/* Forward 64-byte copy loop for a source that is 11 bytes past a
   16-byte boundary: aligned 16-byte loads are stitched together with
   palignr $11 and written with aligned stores.
   The movaps/movdqa accesses require %rsi - 11 and %rdi to be 16-byte
   aligned.  %r9 is advanced by label differences relative to
   L(shl_11), so it presumably holds that label's address on entry
   (confirm at the dispatch site).  %rcx (a size threshold) and
   %xmm0/%r8 (a pending 16-byte store) are set up outside this block.
   Exits through L(table_less_80bytes) with %rdx = bytes still to copy
   and both cursors pointing past that tail.  */
	.p2align 4
L(shl_11):
	lea	(L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
	cmp	%rcx, %rdx
	movaps	-0x0b(%rsi), %xmm1	/* Aligned block preceding %rsi.  */
	jb	L(L11_fwd)		/* %rdx < %rcx: loop without prefetch.  */
	lea	(L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
L(L11_fwd):
	lea	-64(%rdx), %rdx		/* Bias the count by one 64-byte pass.  */
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_11_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_11_loop_L1):
	sub	$64, %rdx		/* Borrow is tested by the jb below.  */
	movaps	0x05(%rsi), %xmm2
	movaps	0x15(%rsi), %xmm3
	movaps	0x25(%rsi), %xmm4
	movaps	0x35(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Keep the raw last block for the next pass.  */
	palignr	$11, %xmm4, %xmm5
	lea	64(%rsi), %rsi
	palignr	$11, %xmm3, %xmm4
	palignr	$11, %xmm2, %xmm3
	lea	64(%rdi), %rdi
	palignr	$11, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_11_end)		/* Flags still come from the sub above.  */
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_11_end):
	movaps	%xmm4, -0x20(%rdi)
	lea	64(%rdx), %rdx		/* Undo the bias: %rdx = bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	add	%rdx, %rdi		/* Point both cursors past the tail.  */
	movdqu	%xmm0, (%r8)
	add	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1331
/* Backward 64-byte copy loop for a source that is 11 bytes past a
   16-byte boundary: aligned loads below %rsi are merged with
   palignr $11 and stored with aligned stores below %rdi.
   The movaps accesses require %rsi - 11 and %rdi to be 16-byte
   aligned.  %r9 is advanced by label differences relative to
   L(shl_11_bwd), so it presumably holds that label's address on entry
   (confirm at the dispatch site).  %rcx (a size threshold) and
   %xmm0/%r8 (a pending 16-byte store) are set up outside this block.
   Exits through L(table_less_80bytes) with %rdx = bytes still to copy;
   the cursors already sit at the end of that remaining range.  */
	.p2align 4
L(shl_11_bwd):
	lea	(L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
	cmp	%rcx, %rdx
	movaps	-0x0b(%rsi), %xmm1	/* Aligned block holding the top bytes.  */
	jb	L(L11_bwd)		/* %rdx < %rcx: loop without prefetch.  */
	lea	(L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
L(L11_bwd):
	lea	-64(%rdx), %rdx		/* Bias the count by one 64-byte pass.  */
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_11_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_11_bwd_loop_L1):
	movaps	-0x1b(%rsi), %xmm2
	sub	$0x40, %rdx		/* Borrow is tested by the jb below.  */
	movaps	-0x2b(%rsi), %xmm3
	movaps	-0x3b(%rsi), %xmm4
	movaps	-0x4b(%rsi), %xmm5
	lea	-0x40(%rsi), %rsi
	palignr	$11, %xmm2, %xmm1
	palignr	$11, %xmm3, %xmm2
	palignr	$11, %xmm4, %xmm3
	palignr	$11, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest block feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	lea	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_11_bwd_end)	/* Flags still come from the sub above.  */
	movaps	%xmm4, (%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_11_bwd_end):
	movaps	%xmm4, (%rdi)
	lea	64(%rdx), %rdx		/* Undo the bias: %rdx = bytes left.  */
	movdqu	%xmm0, (%r8)
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1373
/* Forward 64-byte copy loop for a source that is 12 bytes past a
   16-byte boundary: aligned 16-byte loads are stitched together with
   palignr $12 and written with aligned stores.
   The movaps/movdqa accesses require %rsi - 12 and %rdi to be 16-byte
   aligned.  %r9 is advanced by label differences relative to
   L(shl_12), so it presumably holds that label's address on entry
   (confirm at the dispatch site).  %rcx (a size threshold) and
   %xmm0/%r8 (a pending 16-byte store) are set up outside this block.
   Exits through L(table_less_80bytes) with %rdx = bytes still to copy
   and both cursors pointing past that tail.  */
	.p2align 4
L(shl_12):
	lea	(L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
	cmp	%rcx, %rdx
	movaps	-0x0c(%rsi), %xmm1	/* Aligned block preceding %rsi.  */
	jb	L(L12_fwd)		/* %rdx < %rcx: loop without prefetch.  */
	lea	(L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
L(L12_fwd):
	lea	-64(%rdx), %rdx		/* Bias the count by one 64-byte pass.  */
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_12_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_12_loop_L1):
	sub	$64, %rdx		/* Borrow is tested by the jb below.  */
	movaps	0x04(%rsi), %xmm2
	movaps	0x14(%rsi), %xmm3
	movaps	0x24(%rsi), %xmm4
	movaps	0x34(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Keep the raw last block for the next pass.  */
	palignr	$12, %xmm4, %xmm5
	lea	64(%rsi), %rsi
	palignr	$12, %xmm3, %xmm4
	palignr	$12, %xmm2, %xmm3
	lea	64(%rdi), %rdi
	palignr	$12, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_12_end)		/* Flags still come from the sub above.  */
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_12_end):
	movaps	%xmm4, -0x20(%rdi)
	lea	64(%rdx), %rdx		/* Undo the bias: %rdx = bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	add	%rdx, %rdi		/* Point both cursors past the tail.  */
	movdqu	%xmm0, (%r8)
	add	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1416
/* Backward 64-byte copy loop for a source that is 12 bytes past a
   16-byte boundary: aligned loads below %rsi are merged with
   palignr $12 and stored with aligned stores below %rdi.
   The movaps accesses require %rsi - 12 and %rdi to be 16-byte
   aligned.  %r9 is advanced by label differences relative to
   L(shl_12_bwd), so it presumably holds that label's address on entry
   (confirm at the dispatch site).  %rcx (a size threshold) and
   %xmm0/%r8 (a pending 16-byte store) are set up outside this block.
   Exits through L(table_less_80bytes) with %rdx = bytes still to copy;
   the cursors already sit at the end of that remaining range.  */
	.p2align 4
L(shl_12_bwd):
	lea	(L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
	cmp	%rcx, %rdx
	movaps	-0x0c(%rsi), %xmm1	/* Aligned block holding the top bytes.  */
	jb	L(L12_bwd)		/* %rdx < %rcx: loop without prefetch.  */
	lea	(L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
L(L12_bwd):
	lea	-64(%rdx), %rdx		/* Bias the count by one 64-byte pass.  */
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_12_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_12_bwd_loop_L1):
	movaps	-0x1c(%rsi), %xmm2
	sub	$0x40, %rdx		/* Borrow is tested by the jb below.  */
	movaps	-0x2c(%rsi), %xmm3
	movaps	-0x3c(%rsi), %xmm4
	movaps	-0x4c(%rsi), %xmm5
	lea	-0x40(%rsi), %rsi
	palignr	$12, %xmm2, %xmm1
	palignr	$12, %xmm3, %xmm2
	palignr	$12, %xmm4, %xmm3
	palignr	$12, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest block feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	lea	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_12_bwd_end)	/* Flags still come from the sub above.  */
	movaps	%xmm4, (%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_12_bwd_end):
	movaps	%xmm4, (%rdi)
	lea	64(%rdx), %rdx		/* Undo the bias: %rdx = bytes left.  */
	movdqu	%xmm0, (%r8)
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1458
/* Forward 64-byte copy loop for a source that is 13 bytes past a
   16-byte boundary: aligned 16-byte loads are stitched together with
   palignr $13 and written with aligned stores.
   The movaps/movdqa accesses require %rsi - 13 and %rdi to be 16-byte
   aligned.  %r9 is advanced by label differences relative to
   L(shl_13), so it presumably holds that label's address on entry
   (confirm at the dispatch site).  %rcx (a size threshold) and
   %xmm0/%r8 (a pending 16-byte store) are set up outside this block.
   Exits through L(table_less_80bytes) with %rdx = bytes still to copy
   and both cursors pointing past that tail.  */
	.p2align 4
L(shl_13):
	lea	(L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
	cmp	%rcx, %rdx
	movaps	-0x0d(%rsi), %xmm1	/* Aligned block preceding %rsi.  */
	jb	L(L13_fwd)		/* %rdx < %rcx: loop without prefetch.  */
	lea	(L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
L(L13_fwd):
	lea	-64(%rdx), %rdx		/* Bias the count by one 64-byte pass.  */
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_13_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_13_loop_L1):
	sub	$64, %rdx		/* Borrow is tested by the jb below.  */
	movaps	0x03(%rsi), %xmm2
	movaps	0x13(%rsi), %xmm3
	movaps	0x23(%rsi), %xmm4
	movaps	0x33(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Keep the raw last block for the next pass.  */
	palignr	$13, %xmm4, %xmm5
	lea	64(%rsi), %rsi
	palignr	$13, %xmm3, %xmm4
	palignr	$13, %xmm2, %xmm3
	lea	64(%rdi), %rdi
	palignr	$13, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_13_end)		/* Flags still come from the sub above.  */
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_13_end):
	movaps	%xmm4, -0x20(%rdi)
	lea	64(%rdx), %rdx		/* Undo the bias: %rdx = bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	add	%rdx, %rdi		/* Point both cursors past the tail.  */
	movdqu	%xmm0, (%r8)
	add	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1501
/* Backward 64-byte copy loop for a source that is 13 bytes past a
   16-byte boundary: aligned loads below %rsi are merged with
   palignr $13 and stored with aligned stores below %rdi.
   The movaps accesses require %rsi - 13 and %rdi to be 16-byte
   aligned.  %r9 is advanced by label differences relative to
   L(shl_13_bwd), so it presumably holds that label's address on entry
   (confirm at the dispatch site).  %rcx (a size threshold) and
   %xmm0/%r8 (a pending 16-byte store) are set up outside this block.
   Exits through L(table_less_80bytes) with %rdx = bytes still to copy;
   the cursors already sit at the end of that remaining range.  */
	.p2align 4
L(shl_13_bwd):
	lea	(L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
	cmp	%rcx, %rdx
	movaps	-0x0d(%rsi), %xmm1	/* Aligned block holding the top bytes.  */
	jb	L(L13_bwd)		/* %rdx < %rcx: loop without prefetch.  */
	lea	(L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
L(L13_bwd):
	lea	-64(%rdx), %rdx		/* Bias the count by one 64-byte pass.  */
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_13_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_13_bwd_loop_L1):
	movaps	-0x1d(%rsi), %xmm2
	sub	$0x40, %rdx		/* Borrow is tested by the jb below.  */
	movaps	-0x2d(%rsi), %xmm3
	movaps	-0x3d(%rsi), %xmm4
	movaps	-0x4d(%rsi), %xmm5
	lea	-0x40(%rsi), %rsi
	palignr	$13, %xmm2, %xmm1
	palignr	$13, %xmm3, %xmm2
	palignr	$13, %xmm4, %xmm3
	palignr	$13, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest block feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	lea	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_13_bwd_end)	/* Flags still come from the sub above.  */
	movaps	%xmm4, (%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_13_bwd_end):
	movaps	%xmm4, (%rdi)
	lea	64(%rdx), %rdx		/* Undo the bias: %rdx = bytes left.  */
	movdqu	%xmm0, (%r8)
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1543
/* Forward 64-byte copy loop for a source that is 14 bytes past a
   16-byte boundary: aligned 16-byte loads are stitched together with
   palignr $14 and written with aligned stores.
   The movaps/movdqa accesses require %rsi - 14 and %rdi to be 16-byte
   aligned.  %r9 is advanced by label differences relative to
   L(shl_14), so it presumably holds that label's address on entry
   (confirm at the dispatch site).  %rcx (a size threshold) and
   %xmm0/%r8 (a pending 16-byte store) are set up outside this block.
   Exits through L(table_less_80bytes) with %rdx = bytes still to copy
   and both cursors pointing past that tail.  */
	.p2align 4
L(shl_14):
	lea	(L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
	cmp	%rcx, %rdx
	movaps	-0x0e(%rsi), %xmm1	/* Aligned block preceding %rsi.  */
	jb	L(L14_fwd)		/* %rdx < %rcx: loop without prefetch.  */
	lea	(L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
L(L14_fwd):
	lea	-64(%rdx), %rdx		/* Bias the count by one 64-byte pass.  */
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_14_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_14_loop_L1):
	sub	$64, %rdx		/* Borrow is tested by the jb below.  */
	movaps	0x02(%rsi), %xmm2
	movaps	0x12(%rsi), %xmm3
	movaps	0x22(%rsi), %xmm4
	movaps	0x32(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Keep the raw last block for the next pass.  */
	palignr	$14, %xmm4, %xmm5
	lea	64(%rsi), %rsi
	palignr	$14, %xmm3, %xmm4
	palignr	$14, %xmm2, %xmm3
	lea	64(%rdi), %rdi
	palignr	$14, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_14_end)		/* Flags still come from the sub above.  */
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_14_end):
	movaps	%xmm4, -0x20(%rdi)
	lea	64(%rdx), %rdx		/* Undo the bias: %rdx = bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	add	%rdx, %rdi		/* Point both cursors past the tail.  */
	movdqu	%xmm0, (%r8)
	add	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1586
/* Backward 64-byte copy loop for a source that is 14 bytes past a
   16-byte boundary: aligned loads below %rsi are merged with
   palignr $14 and stored with aligned stores below %rdi.
   The movaps accesses require %rsi - 14 and %rdi to be 16-byte
   aligned.  %r9 is advanced by label differences relative to
   L(shl_14_bwd), so it presumably holds that label's address on entry
   (confirm at the dispatch site).  %rcx (a size threshold) and
   %xmm0/%r8 (a pending 16-byte store) are set up outside this block.
   Exits through L(table_less_80bytes) with %rdx = bytes still to copy;
   the cursors already sit at the end of that remaining range.  */
	.p2align 4
L(shl_14_bwd):
	lea	(L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
	cmp	%rcx, %rdx
	movaps	-0x0e(%rsi), %xmm1	/* Aligned block holding the top bytes.  */
	jb	L(L14_bwd)		/* %rdx < %rcx: loop without prefetch.  */
	lea	(L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
L(L14_bwd):
	lea	-64(%rdx), %rdx		/* Bias the count by one 64-byte pass.  */
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_14_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_14_bwd_loop_L1):
	movaps	-0x1e(%rsi), %xmm2
	sub	$0x40, %rdx		/* Borrow is tested by the jb below.  */
	movaps	-0x2e(%rsi), %xmm3
	movaps	-0x3e(%rsi), %xmm4
	movaps	-0x4e(%rsi), %xmm5
	lea	-0x40(%rsi), %rsi
	palignr	$14, %xmm2, %xmm1
	palignr	$14, %xmm3, %xmm2
	palignr	$14, %xmm4, %xmm3
	palignr	$14, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest block feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	lea	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_14_bwd_end)	/* Flags still come from the sub above.  */
	movaps	%xmm4, (%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_14_bwd_end):
	movaps	%xmm4, (%rdi)
	lea	64(%rdx), %rdx		/* Undo the bias: %rdx = bytes left.  */
	movdqu	%xmm0, (%r8)
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1628
/* Forward 64-byte copy loop for a source that is 15 bytes past a
   16-byte boundary: aligned 16-byte loads are stitched together with
   palignr $15 and written with aligned stores.
   The movaps/movdqa accesses require %rsi - 15 and %rdi to be 16-byte
   aligned.  %r9 is advanced by label differences relative to
   L(shl_15), so it presumably holds that label's address on entry
   (confirm at the dispatch site).  %rcx (a size threshold) and
   %xmm0/%r8 (a pending 16-byte store) are set up outside this block.
   Exits through L(table_less_80bytes) with %rdx = bytes still to copy
   and both cursors pointing past that tail.  */
	.p2align 4
L(shl_15):
	lea	(L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
	cmp	%rcx, %rdx
	movaps	-0x0f(%rsi), %xmm1	/* Aligned block preceding %rsi.  */
	jb	L(L15_fwd)		/* %rdx < %rcx: loop without prefetch.  */
	lea	(L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
L(L15_fwd):
	lea	-64(%rdx), %rdx		/* Bias the count by one 64-byte pass.  */
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_15_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_15_loop_L1):
	sub	$64, %rdx		/* Borrow is tested by the jb below.  */
	movaps	0x01(%rsi), %xmm2
	movaps	0x11(%rsi), %xmm3
	movaps	0x21(%rsi), %xmm4
	movaps	0x31(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Keep the raw last block for the next pass.  */
	palignr	$15, %xmm4, %xmm5
	lea	64(%rsi), %rsi
	palignr	$15, %xmm3, %xmm4
	palignr	$15, %xmm2, %xmm3
	lea	64(%rdi), %rdi
	palignr	$15, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_15_end)		/* Flags still come from the sub above.  */
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_15_end):
	movaps	%xmm4, -0x20(%rdi)
	lea	64(%rdx), %rdx		/* Undo the bias: %rdx = bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	add	%rdx, %rdi		/* Point both cursors past the tail.  */
	movdqu	%xmm0, (%r8)
	add	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1671
/* Backward 64-byte copy loop for a source that is 15 bytes past a
   16-byte boundary: aligned loads below %rsi are merged with
   palignr $15 and stored with aligned stores below %rdi.
   The movaps accesses require %rsi - 15 and %rdi to be 16-byte
   aligned.  %r9 is advanced by label differences relative to
   L(shl_15_bwd), so it presumably holds that label's address on entry
   (confirm at the dispatch site).  %rcx (a size threshold) and
   %xmm0/%r8 (a pending 16-byte store) are set up outside this block.
   Exits through L(table_less_80bytes) with %rdx = bytes still to copy;
   the cursors already sit at the end of that remaining range.  */
	.p2align 4
L(shl_15_bwd):
	lea	(L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
	cmp	%rcx, %rdx
	movaps	-0x0f(%rsi), %xmm1	/* Aligned block holding the top bytes.  */
	jb	L(L15_bwd)		/* %rdx < %rcx: loop without prefetch.  */
	lea	(L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
L(L15_bwd):
	lea	-64(%rdx), %rdx		/* Bias the count by one 64-byte pass.  */
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_15_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_15_bwd_loop_L1):
	movaps	-0x1f(%rsi), %xmm2
	sub	$0x40, %rdx		/* Borrow is tested by the jb below.  */
	movaps	-0x2f(%rsi), %xmm3
	movaps	-0x3f(%rsi), %xmm4
	movaps	-0x4f(%rsi), %xmm5
	lea	-0x40(%rsi), %rsi
	palignr	$15, %xmm2, %xmm1
	palignr	$15, %xmm3, %xmm2
	palignr	$15, %xmm4, %xmm3
	palignr	$15, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest block feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	lea	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_15_bwd_end)	/* Flags still come from the sub above.  */
	movaps	%xmm4, (%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_15_bwd_end):
	movaps	%xmm4, (%rdi)
	lea	64(%rdx), %rdx		/* Undo the bias: %rdx = bytes left.  */
	movdqu	%xmm0, (%r8)
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1713
/* Copy the 72 bytes ending at %rsi to the 72 bytes ending at %rdi.
   All loads are issued before the first store.
   Clobbers %xmm0, %xmm1, %r8-%r11, %rcx.  */
	.p2align 4
L(write_72bytes):
	movdqu	-72(%rsi), %xmm0
	movdqu	-56(%rsi), %xmm1
	mov	-40(%rsi), %r8
	mov	-32(%rsi), %r9
	mov	-24(%rsi), %r10
	mov	-16(%rsi), %r11
	mov	-8(%rsi), %rcx
	movdqu	%xmm0, -72(%rdi)
	movdqu	%xmm1, -56(%rdi)
	mov	%r8, -40(%rdi)
	mov	%r9, -32(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r11, -16(%rdi)
	mov	%rcx, -8(%rdi)
	ret
1731
/* Copy the 64 bytes ending at %rsi to the 64 bytes ending at %rdi.
   All loads are issued before the first store.
   Clobbers %xmm0, %rcx, %r8-%r11, %rdx.  */
	.p2align 4
L(write_64bytes):
	movdqu	-64(%rsi), %xmm0
	mov	-48(%rsi), %rcx
	mov	-40(%rsi), %r8
	mov	-32(%rsi), %r9
	mov	-24(%rsi), %r10
	mov	-16(%rsi), %r11
	mov	-8(%rsi), %rdx
	movdqu	%xmm0, -64(%rdi)
	mov	%rcx, -48(%rdi)
	mov	%r8, -40(%rdi)
	mov	%r9, -32(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r11, -16(%rdi)
	mov	%rdx, -8(%rdi)
	ret
1749
/* Copy the 56 bytes ending at %rsi to the 56 bytes ending at %rdi.
   All loads are issued before the first store.
   Clobbers %xmm0, %r8-%r11, %rcx.  */
	.p2align 4
L(write_56bytes):
	movdqu	-56(%rsi), %xmm0
	mov	-40(%rsi), %r8
	mov	-32(%rsi), %r9
	mov	-24(%rsi), %r10
	mov	-16(%rsi), %r11
	mov	-8(%rsi), %rcx
	movdqu	%xmm0, -56(%rdi)
	mov	%r8, -40(%rdi)
	mov	%r9, -32(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r11, -16(%rdi)
	mov	%rcx, -8(%rdi)
	ret
1765
/* Copy the 48 bytes ending at %rsi to the 48 bytes ending at %rdi.
   All loads are issued before the first store.
   Clobbers %rcx, %r8-%r11, %rdx.  */
	.p2align 4
L(write_48bytes):
	mov	-48(%rsi), %rcx
	mov	-40(%rsi), %r8
	mov	-32(%rsi), %r9
	mov	-24(%rsi), %r10
	mov	-16(%rsi), %r11
	mov	-8(%rsi), %rdx
	mov	%rcx, -48(%rdi)
	mov	%r8, -40(%rdi)
	mov	%r9, -32(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r11, -16(%rdi)
	mov	%rdx, -8(%rdi)
	ret
1781
/* Copy the 40 bytes ending at %rsi to the 40 bytes ending at %rdi.
   All loads are issued before the first store.
   Clobbers %r8-%r11, %rdx.  */
	.p2align 4
L(write_40bytes):
	mov	-40(%rsi), %r8
	mov	-32(%rsi), %r9
	mov	-24(%rsi), %r10
	mov	-16(%rsi), %r11
	mov	-8(%rsi), %rdx
	mov	%r8, -40(%rdi)
	mov	%r9, -32(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r11, -16(%rdi)
	mov	%rdx, -8(%rdi)
	ret
1795
/* Copy the 32 bytes ending at %rsi to the 32 bytes ending at %rdi.
   All loads are issued before the first store.
   Clobbers %r9-%r11, %rdx.  */
	.p2align 4
L(write_32bytes):
	mov	-32(%rsi), %r9
	mov	-24(%rsi), %r10
	mov	-16(%rsi), %r11
	mov	-8(%rsi), %rdx
	mov	%r9, -32(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r11, -16(%rdi)
	mov	%rdx, -8(%rdi)
	ret
1807
/* Copy the 24 bytes ending at %rsi to the 24 bytes ending at %rdi.
   All loads are issued before the first store.
   Clobbers %r10, %r11, %rdx.  */
	.p2align 4
L(write_24bytes):
	mov	-24(%rsi), %r10
	mov	-16(%rsi), %r11
	mov	-8(%rsi), %rdx
	mov	%r10, -24(%rdi)
	mov	%r11, -16(%rdi)
	mov	%rdx, -8(%rdi)
	ret
1817
/* Copy the 16 bytes ending at %rsi to the 16 bytes ending at %rdi.
   Both loads are issued before the first store.
   Clobbers %r11, %rdx.  */
	.p2align 4
L(write_16bytes):
	mov	-16(%rsi), %r11
	mov	-8(%rsi), %rdx
	mov	%r11, -16(%rdi)
	mov	%rdx, -8(%rdi)
	ret
1825
/* Copy the 8 bytes ending at %rsi to the 8 bytes ending at %rdi, then
   fall into L(write_0bytes), which only returns.  Clobbers %rdx.  */
	.p2align 4
L(write_8bytes):
	mov	-8(%rsi), %rdx
	mov	%rdx, -8(%rdi)
L(write_0bytes):
	ret
1832
/* Copy the 73 bytes ending at %rsi to the 73 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -9 by 3 bytes.
   Clobbers %xmm0, %xmm1, %rcx, %r8-%r11, %rdx.  */
	.p2align 4
L(write_73bytes):
	movdqu	-73(%rsi), %xmm0
	movdqu	-57(%rsi), %xmm1
	mov	-41(%rsi), %rcx
	mov	-33(%rsi), %r9
	mov	-25(%rsi), %r10
	mov	-17(%rsi), %r11
	mov	-9(%rsi), %r8
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -73(%rdi)
	movdqu	%xmm1, -57(%rdi)
	mov	%rcx, -41(%rdi)
	mov	%r9, -33(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r11, -17(%rdi)
	mov	%r8, -9(%rdi)
	mov	%edx, -4(%rdi)
	ret
1852
/* Copy the 65 bytes ending at %rsi to the 65 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -9 by 3 bytes.
   Clobbers %xmm0, %xmm1, %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_65bytes):
	movdqu	-65(%rsi), %xmm0
	movdqu	-49(%rsi), %xmm1
	mov	-33(%rsi), %r9
	mov	-25(%rsi), %r10
	mov	-17(%rsi), %r11
	mov	-9(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -65(%rdi)
	movdqu	%xmm1, -49(%rdi)
	mov	%r9, -33(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r11, -17(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%edx, -4(%rdi)
	ret
1870
/* Copy the 57 bytes ending at %rsi to the 57 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -9 by 3 bytes.
   Clobbers %xmm0, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_57bytes):
	movdqu	-57(%rsi), %xmm0
	mov	-41(%rsi), %r8
	mov	-33(%rsi), %r9
	mov	-25(%rsi), %r10
	mov	-17(%rsi), %r11
	mov	-9(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -57(%rdi)
	mov	%r8, -41(%rdi)
	mov	%r9, -33(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r11, -17(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%edx, -4(%rdi)
	ret
1888
/* Copy the 49 bytes ending at %rsi to the 49 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -9 by 3 bytes.
   Clobbers %xmm0, %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_49bytes):
	movdqu	-49(%rsi), %xmm0
	mov	-33(%rsi), %r9
	mov	-25(%rsi), %r10
	mov	-17(%rsi), %r11
	mov	-9(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -49(%rdi)
	mov	%r9, -33(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r11, -17(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%edx, -4(%rdi)
	ret
1904
/* Copy the 41 bytes ending at %rsi to the 41 bytes ending at %rdi:
   five 8-byte moves plus one byte move for the last byte.
   All loads are issued before the first store.
   Clobbers %r8-%r11, %rcx, %dl.  */
	.p2align 4
L(write_41bytes):
	mov	-41(%rsi), %r8
	mov	-33(%rsi), %r9
	mov	-25(%rsi), %r10
	mov	-17(%rsi), %r11
	mov	-9(%rsi), %rcx
	mov	-1(%rsi), %dl
	mov	%r8, -41(%rdi)
	mov	%r9, -33(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r11, -17(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%dl, -1(%rdi)
	ret
1920
/* Copy the 33 bytes ending at %rsi to the 33 bytes ending at %rdi:
   four 8-byte moves plus one byte move for the last byte.
   All loads are issued before the first store.
   Clobbers %r9-%r11, %rcx, %dl.  */
	.p2align 4
L(write_33bytes):
	mov	-33(%rsi), %r9
	mov	-25(%rsi), %r10
	mov	-17(%rsi), %r11
	mov	-9(%rsi), %rcx
	mov	-1(%rsi), %dl
	mov	%r9, -33(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r11, -17(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%dl, -1(%rdi)
	ret
1934
/* Copy the 25 bytes ending at %rsi to the 25 bytes ending at %rdi:
   three 8-byte moves plus one byte move for the last byte.
   All loads are issued before the first store.
   Clobbers %r10, %r11, %rcx, %dl.  */
	.p2align 4
L(write_25bytes):
	mov	-25(%rsi), %r10
	mov	-17(%rsi), %r11
	mov	-9(%rsi), %rcx
	mov	-1(%rsi), %dl
	mov	%r10, -25(%rdi)
	mov	%r11, -17(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%dl, -1(%rdi)
	ret
1946
/* Copy the 17 bytes ending at %rsi to the 17 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -9 by 3 bytes.
   Clobbers %r11, %rcx, %rdx.  */
	.p2align 4
L(write_17bytes):
	mov	-17(%rsi), %r11
	mov	-9(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r11, -17(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%edx, -4(%rdi)
	ret
1956
/* Copy the 9 bytes ending at %rsi to the 9 bytes ending at %rdi using
   an 8-byte and a 4-byte move that overlap by 3 bytes.
   Both loads are issued before the first store.  Clobbers %rcx, %rdx.  */
	.p2align 4
L(write_9bytes):
	mov	-9(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%rcx, -9(%rdi)
	mov	%edx, -4(%rdi)
	ret
1964
/* Copy the single byte at -1(%rsi) to -1(%rdi).  Clobbers %dl.  */
	.p2align 4
L(write_1bytes):
	mov	-1(%rsi), %dl
	mov	%dl, -1(%rdi)
	ret
1970
/* Copy the 74 bytes ending at %rsi to the 74 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -10 by 2 bytes.
   Clobbers %xmm0, %xmm1, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_74bytes):
	movdqu	-74(%rsi), %xmm0
	movdqu	-58(%rsi), %xmm1
	mov	-42(%rsi), %r8
	mov	-34(%rsi), %r9
	mov	-26(%rsi), %r10
	mov	-18(%rsi), %r11
	mov	-10(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -74(%rdi)
	movdqu	%xmm1, -58(%rdi)
	mov	%r8, -42(%rdi)
	mov	%r9, -34(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r11, -18(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%edx, -4(%rdi)
	ret
1990
/* Copy the 66 bytes ending at %rsi to the 66 bytes ending at %rdi.
   The two 16-byte moves cover offsets -66..-35 and the 8-byte moves
   continue from -34, so no separate move at -42 is needed.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -10 by 2 bytes.
   Clobbers %xmm0, %xmm1, %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_66bytes):
	movdqu	-66(%rsi), %xmm0
	movdqu	-50(%rsi), %xmm1
	mov	-34(%rsi), %r9
	mov	-26(%rsi), %r10
	mov	-18(%rsi), %r11
	mov	-10(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -66(%rdi)
	movdqu	%xmm1, -50(%rdi)
	mov	%r9, -34(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r11, -18(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%edx, -4(%rdi)
	ret
2010
/* Copy the 58 bytes ending at %rsi to the 58 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -10 by 2 bytes.
   Clobbers %xmm1, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_58bytes):
	movdqu	-58(%rsi), %xmm1
	mov	-42(%rsi), %r8
	mov	-34(%rsi), %r9
	mov	-26(%rsi), %r10
	mov	-18(%rsi), %r11
	mov	-10(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm1, -58(%rdi)
	mov	%r8, -42(%rdi)
	mov	%r9, -34(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r11, -18(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%edx, -4(%rdi)
	ret
2028
/* Copy the 50 bytes ending at %rsi to the 50 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -10 by 2 bytes.
   Clobbers %xmm0, %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_50bytes):
	movdqu	-50(%rsi), %xmm0
	mov	-34(%rsi), %r9
	mov	-26(%rsi), %r10
	mov	-18(%rsi), %r11
	mov	-10(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -50(%rdi)
	mov	%r9, -34(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r11, -18(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%edx, -4(%rdi)
	ret
2044
/* Copy the 42 bytes ending at %rsi to the 42 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -10 by 2 bytes.
   Clobbers %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_42bytes):
	mov	-42(%rsi), %r8
	mov	-34(%rsi), %r9
	mov	-26(%rsi), %r10
	mov	-18(%rsi), %r11
	mov	-10(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r8, -42(%rdi)
	mov	%r9, -34(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r11, -18(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%edx, -4(%rdi)
	ret
2060
/* Copy the 34 bytes ending at %rsi to the 34 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -10 by 2 bytes.
   Clobbers %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_34bytes):
	mov	-34(%rsi), %r9
	mov	-26(%rsi), %r10
	mov	-18(%rsi), %r11
	mov	-10(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r9, -34(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r11, -18(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%edx, -4(%rdi)
	ret
2074
/* Copy the 26 bytes ending at %rsi to the 26 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -10 by 2 bytes.
   Clobbers %r10, %r11, %rcx, %rdx.  */
	.p2align 4
L(write_26bytes):
	mov	-26(%rsi), %r10
	mov	-18(%rsi), %r11
	mov	-10(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r10, -26(%rdi)
	mov	%r11, -18(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%edx, -4(%rdi)
	ret
2086
/* Copy the 18 bytes ending at %rsi to the 18 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -10 by 2 bytes.
   Clobbers %r11, %rcx, %rdx.  */
	.p2align 4
L(write_18bytes):
	mov	-18(%rsi), %r11
	mov	-10(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r11, -18(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%edx, -4(%rdi)
	ret
2096
/* Copy the 10 bytes ending at %rsi to the 10 bytes ending at %rdi using
   an 8-byte and a 4-byte move that overlap by 2 bytes.
   Both loads are issued before the first store.  Clobbers %rcx, %rdx.  */
	.p2align 4
L(write_10bytes):
	mov	-10(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%rcx, -10(%rdi)
	mov	%edx, -4(%rdi)
	ret
2104
/* Copy the 2 bytes ending at %rsi to the 2 bytes ending at %rdi with a
   single 16-bit move.  Clobbers %dx.  */
	.p2align 4
L(write_2bytes):
	mov	-2(%rsi), %dx
	mov	%dx, -2(%rdi)
	ret
2110
/* Copy the 75 bytes ending at %rsi to the 75 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -11 by 1 byte.
   Clobbers %xmm0, %xmm1, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_75bytes):
	movdqu	-75(%rsi), %xmm0
	movdqu	-59(%rsi), %xmm1
	mov	-43(%rsi), %r8
	mov	-35(%rsi), %r9
	mov	-27(%rsi), %r10
	mov	-19(%rsi), %r11
	mov	-11(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -75(%rdi)
	movdqu	%xmm1, -59(%rdi)
	mov	%r8, -43(%rdi)
	mov	%r9, -35(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r11, -19(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%edx, -4(%rdi)
	ret
2130
/* Copy the 67 bytes ending at %rsi to the 67 bytes ending at %rdi.
   The two 16-byte moves (at -67 and -59) overlap each other by 8
   bytes, and the final 4-byte move overlaps the 8-byte move at -11 by
   1 byte.  All loads are issued before the first store.
   Clobbers %xmm0, %xmm1, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_67bytes):
	movdqu	-67(%rsi), %xmm0
	movdqu	-59(%rsi), %xmm1
	mov	-43(%rsi), %r8
	mov	-35(%rsi), %r9
	mov	-27(%rsi), %r10
	mov	-19(%rsi), %r11
	mov	-11(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -67(%rdi)
	movdqu	%xmm1, -59(%rdi)
	mov	%r8, -43(%rdi)
	mov	%r9, -35(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r11, -19(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%edx, -4(%rdi)
	ret
2150
/* Copy the 59 bytes ending at %rsi to the 59 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -11 by 1 byte.
   Clobbers %xmm0, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_59bytes):
	movdqu	-59(%rsi), %xmm0
	mov	-43(%rsi), %r8
	mov	-35(%rsi), %r9
	mov	-27(%rsi), %r10
	mov	-19(%rsi), %r11
	mov	-11(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -59(%rdi)
	mov	%r8, -43(%rdi)
	mov	%r9, -35(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r11, -19(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%edx, -4(%rdi)
	ret
2168
/* Copy the 51 bytes ending at %rsi to the 51 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -11 by 1 byte.
   Clobbers %xmm0, %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_51bytes):
	movdqu	-51(%rsi), %xmm0
	mov	-35(%rsi), %r9
	mov	-27(%rsi), %r10
	mov	-19(%rsi), %r11
	mov	-11(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -51(%rdi)
	mov	%r9, -35(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r11, -19(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%edx, -4(%rdi)
	ret
2184
/* Copy the 43 bytes ending at %rsi to the 43 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -11 by 1 byte.
   Clobbers %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_43bytes):
	mov	-43(%rsi), %r8
	mov	-35(%rsi), %r9
	mov	-27(%rsi), %r10
	mov	-19(%rsi), %r11
	mov	-11(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r8, -43(%rdi)
	mov	%r9, -35(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r11, -19(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%edx, -4(%rdi)
	ret
2200
/* Copy the 35 bytes ending at %rsi to the 35 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -11 by 1 byte.
   Clobbers %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_35bytes):
	mov	-35(%rsi), %r9
	mov	-27(%rsi), %r10
	mov	-19(%rsi), %r11
	mov	-11(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r9, -35(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r11, -19(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%edx, -4(%rdi)
	ret
2214
/* Copy the 27 bytes ending at %rsi to the 27 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -11 by 1 byte.
   Clobbers %r10, %r11, %rcx, %rdx.  */
	.p2align 4
L(write_27bytes):
	mov	-27(%rsi), %r10
	mov	-19(%rsi), %r11
	mov	-11(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r10, -27(%rdi)
	mov	%r11, -19(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%edx, -4(%rdi)
	ret
2226
/* Copy the 19 bytes ending at %rsi to the 19 bytes ending at %rdi.
   All loads are issued before the first store; the final 4-byte move
   overlaps the 8-byte move at -11 by 1 byte.
   Clobbers %r11, %rcx, %rdx.  */
	.p2align 4
L(write_19bytes):
	mov	-19(%rsi), %r11
	mov	-11(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r11, -19(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%edx, -4(%rdi)
	ret
2236
/* Copy the 11 bytes ending at %rsi to the 11 bytes ending at %rdi using
   an 8-byte and a 4-byte move that overlap by 1 byte.
   Both loads are issued before the first store.  Clobbers %rcx, %rdx.  */
	.p2align 4
L(write_11bytes):
	mov	-11(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%rcx, -11(%rdi)
	mov	%edx, -4(%rdi)
	ret
2244
/* Copy the 3 bytes ending at %rsi to the 3 bytes ending at %rdi using
   two 16-bit moves that overlap by 1 byte.
   Both loads are issued before the first store.  Clobbers %dx, %cx.  */
	.p2align 4
L(write_3bytes):
	mov	-3(%rsi), %dx
	mov	-2(%rsi), %cx
	mov	%dx, -3(%rdi)
	mov	%cx, -2(%rdi)
	ret
2252
/* Copy the 76 bytes ending at %rsi to the 76 bytes ending at %rdi.
   All loads are issued before the first store; the last 4 bytes use a
   32-bit move.
   Clobbers %xmm0, %xmm1, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_76bytes):
	movdqu	-76(%rsi), %xmm0
	movdqu	-60(%rsi), %xmm1
	mov	-44(%rsi), %r8
	mov	-36(%rsi), %r9
	mov	-28(%rsi), %r10
	mov	-20(%rsi), %r11
	mov	-12(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -76(%rdi)
	movdqu	%xmm1, -60(%rdi)
	mov	%r8, -44(%rdi)
	mov	%r9, -36(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r11, -20(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%edx, -4(%rdi)
	ret
2272
/* Copy the 68 bytes ending at %rsi to the 68 bytes ending at %rdi.
   All loads are issued before the first store; the last 4 bytes use a
   32-bit move.
   Clobbers %xmm0, %xmm1, %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_68bytes):
	movdqu	-68(%rsi), %xmm0
	movdqu	-52(%rsi), %xmm1
	mov	-36(%rsi), %r9
	mov	-28(%rsi), %r10
	mov	-20(%rsi), %r11
	mov	-12(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -68(%rdi)
	movdqu	%xmm1, -52(%rdi)
	mov	%r9, -36(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r11, -20(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%edx, -4(%rdi)
	ret
2290
/* Copy the 60 bytes ending at %rsi to the 60 bytes ending at %rdi.
   All loads are issued before the first store; the last 4 bytes use a
   32-bit move.
   Clobbers %xmm0, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_60bytes):
	movdqu	-60(%rsi), %xmm0
	mov	-44(%rsi), %r8
	mov	-36(%rsi), %r9
	mov	-28(%rsi), %r10
	mov	-20(%rsi), %r11
	mov	-12(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -60(%rdi)
	mov	%r8, -44(%rdi)
	mov	%r9, -36(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r11, -20(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%edx, -4(%rdi)
	ret
2308
/* Copy the 52 bytes ending at %rsi to the 52 bytes ending at %rdi.
   All loads are issued before the first store; the last 4 bytes use a
   32-bit move.
   Clobbers %xmm0, %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_52bytes):
	movdqu	-52(%rsi), %xmm0
	mov	-36(%rsi), %r9
	mov	-28(%rsi), %r10
	mov	-20(%rsi), %r11
	mov	-12(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -52(%rdi)
	mov	%r9, -36(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r11, -20(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%edx, -4(%rdi)
	ret
2324
/* Copy the 44 bytes ending at %rsi to the 44 bytes ending at %rdi.
   All loads are issued before the first store; the last 4 bytes use a
   32-bit move.
   Clobbers %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_44bytes):
	mov	-44(%rsi), %r8
	mov	-36(%rsi), %r9
	mov	-28(%rsi), %r10
	mov	-20(%rsi), %r11
	mov	-12(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r8, -44(%rdi)
	mov	%r9, -36(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r11, -20(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%edx, -4(%rdi)
	ret
2340
/* Copy the 36 bytes ending at %rsi to the 36 bytes ending at %rdi.
   All loads are issued before the first store; the last 4 bytes use a
   32-bit move.
   Clobbers %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_36bytes):
	mov	-36(%rsi), %r9
	mov	-28(%rsi), %r10
	mov	-20(%rsi), %r11
	mov	-12(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r9, -36(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r11, -20(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%edx, -4(%rdi)
	ret
2354
/* Copy the 28 bytes ending at %rsi to the 28 bytes ending at %rdi.
   All loads are issued before the first store; the last 4 bytes use a
   32-bit move.
   Clobbers %r10, %r11, %rcx, %rdx.  */
	.p2align 4
L(write_28bytes):
	mov	-28(%rsi), %r10
	mov	-20(%rsi), %r11
	mov	-12(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r10, -28(%rdi)
	mov	%r11, -20(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%edx, -4(%rdi)
	ret
2366
/* Copy the 20 bytes ending at %rsi to the 20 bytes ending at %rdi.
   All loads are issued before the first store; the last 4 bytes use a
   32-bit move.
   Clobbers %r11, %rcx, %rdx.  */
	.p2align 4
L(write_20bytes):
	mov	-20(%rsi), %r11
	mov	-12(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%r11, -20(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%edx, -4(%rdi)
	ret
2376
/* Copy the 12 bytes ending at %rsi to the 12 bytes ending at %rdi with
   one 8-byte and one 4-byte move.
   Both loads are issued before the first store.  Clobbers %rcx, %rdx.  */
	.p2align 4
L(write_12bytes):
	mov	-12(%rsi), %rcx
	mov	-4(%rsi), %edx
	mov	%rcx, -12(%rdi)
	mov	%edx, -4(%rdi)
	ret
2384
/* Copy the 4 bytes ending at %rsi to the 4 bytes ending at %rdi with a
   single 32-bit move.  Clobbers %rdx.  */
	.p2align 4
L(write_4bytes):
	mov	-4(%rsi), %edx
	mov	%edx, -4(%rdi)
	ret
2390
/* Copy the 77 bytes ending at %rsi to the 77 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -13 by 3 bytes.
   Clobbers %xmm0, %xmm1, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_77bytes):
	movdqu	-77(%rsi), %xmm0
	movdqu	-61(%rsi), %xmm1
	mov	-45(%rsi), %r8
	mov	-37(%rsi), %r9
	mov	-29(%rsi), %r10
	mov	-21(%rsi), %r11
	mov	-13(%rsi), %rcx
	mov	-8(%rsi), %rdx
	movdqu	%xmm0, -77(%rdi)
	movdqu	%xmm1, -61(%rdi)
	mov	%r8, -45(%rdi)
	mov	%r9, -37(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r11, -21(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2410
/* Copy the 69 bytes ending at %rsi to the 69 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -13 by 3 bytes.
   Clobbers %xmm0, %xmm1, %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_69bytes):
	movdqu	-69(%rsi), %xmm0
	movdqu	-53(%rsi), %xmm1
	mov	-37(%rsi), %r9
	mov	-29(%rsi), %r10
	mov	-21(%rsi), %r11
	mov	-13(%rsi), %rcx
	mov	-8(%rsi), %rdx
	movdqu	%xmm0, -69(%rdi)
	movdqu	%xmm1, -53(%rdi)
	mov	%r9, -37(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r11, -21(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2428
/* Copy the 61 bytes ending at %rsi to the 61 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -13 by 3 bytes.
   Clobbers %xmm0, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_61bytes):
	movdqu	-61(%rsi), %xmm0
	mov	-45(%rsi), %r8
	mov	-37(%rsi), %r9
	mov	-29(%rsi), %r10
	mov	-21(%rsi), %r11
	mov	-13(%rsi), %rcx
	mov	-8(%rsi), %rdx
	movdqu	%xmm0, -61(%rdi)
	mov	%r8, -45(%rdi)
	mov	%r9, -37(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r11, -21(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2446
/* Copy the 53 bytes ending at %rsi to the 53 bytes ending at %rdi.
   The 16-byte move covers offsets -53..-38 and the 8-byte moves
   continue from -37, so no load at -45 is needed.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -13 by 3 bytes.
   Clobbers %xmm0, %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_53bytes):
	movdqu	-53(%rsi), %xmm0
	mov	-37(%rsi), %r9
	mov	-29(%rsi), %r10
	mov	-21(%rsi), %r11
	mov	-13(%rsi), %rcx
	mov	-8(%rsi), %rdx
	movdqu	%xmm0, -53(%rdi)
	mov	%r9, -37(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r11, -21(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2463
/* Copy the 45 bytes ending at %rsi to the 45 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -13 by 3 bytes.
   Clobbers %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_45bytes):
	mov	-45(%rsi), %r8
	mov	-37(%rsi), %r9
	mov	-29(%rsi), %r10
	mov	-21(%rsi), %r11
	mov	-13(%rsi), %rcx
	mov	-8(%rsi), %rdx
	mov	%r8, -45(%rdi)
	mov	%r9, -37(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r11, -21(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2479
/* Copy the 37 bytes ending at %rsi to the 37 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -13 by 3 bytes.
   Clobbers %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_37bytes):
	mov	-37(%rsi), %r9
	mov	-29(%rsi), %r10
	mov	-21(%rsi), %r11
	mov	-13(%rsi), %rcx
	mov	-8(%rsi), %rdx
	mov	%r9, -37(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r11, -21(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2493
/* Copy the 29 bytes ending at %rsi to the 29 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -13 by 3 bytes.
   Clobbers %r10, %r11, %rcx, %rdx.  */
	.p2align 4
L(write_29bytes):
	mov	-29(%rsi), %r10
	mov	-21(%rsi), %r11
	mov	-13(%rsi), %rcx
	mov	-8(%rsi), %rdx
	mov	%r10, -29(%rdi)
	mov	%r11, -21(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2505
/* Copy the 21 bytes ending at %rsi to the 21 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -13 by 3 bytes.
   Clobbers %r11, %rcx, %rdx.  */
	.p2align 4
L(write_21bytes):
	mov	-21(%rsi), %r11
	mov	-13(%rsi), %rcx
	mov	-8(%rsi), %rdx
	mov	%r11, -21(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2515
/* Copy the 13 bytes ending at %rsi to the 13 bytes ending at %rdi using
   two 8-byte moves that overlap by 3 bytes.
   Both loads are issued before the first store.  Clobbers %rcx, %rdx.  */
	.p2align 4
L(write_13bytes):
	mov	-13(%rsi), %rcx
	mov	-8(%rsi), %rdx
	mov	%rcx, -13(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2523
/* Copy the 5 bytes ending at %rsi to the 5 bytes ending at %rdi using
   two 4-byte moves that overlap by 3 bytes.
   Both loads are issued before the first store.  Clobbers %rdx, %rcx.  */
	.p2align 4
L(write_5bytes):
	mov	-5(%rsi), %edx
	mov	-4(%rsi), %ecx
	mov	%edx, -5(%rdi)
	mov	%ecx, -4(%rdi)
	ret
2531
/* Copy the 78 bytes ending at %rsi to the 78 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -14 by 2 bytes.
   Clobbers %xmm0, %xmm1, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_78bytes):
	movdqu	-78(%rsi), %xmm0
	movdqu	-62(%rsi), %xmm1
	mov	-46(%rsi), %r8
	mov	-38(%rsi), %r9
	mov	-30(%rsi), %r10
	mov	-22(%rsi), %r11
	mov	-14(%rsi), %rcx
	mov	-8(%rsi), %rdx
	movdqu	%xmm0, -78(%rdi)
	movdqu	%xmm1, -62(%rdi)
	mov	%r8, -46(%rdi)
	mov	%r9, -38(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r11, -22(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2551
/* Copy the 70 bytes ending at %rsi to the 70 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -14 by 2 bytes.
   Clobbers %xmm0, %xmm1, %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_70bytes):
	movdqu	-70(%rsi), %xmm0
	movdqu	-54(%rsi), %xmm1
	mov	-38(%rsi), %r9
	mov	-30(%rsi), %r10
	mov	-22(%rsi), %r11
	mov	-14(%rsi), %rcx
	mov	-8(%rsi), %rdx
	movdqu	%xmm0, -70(%rdi)
	movdqu	%xmm1, -54(%rdi)
	mov	%r9, -38(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r11, -22(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2569
/* Copy the 62 bytes ending at %rsi to the 62 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -14 by 2 bytes.
   Clobbers %xmm0, %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_62bytes):
	movdqu	-62(%rsi), %xmm0
	mov	-46(%rsi), %r8
	mov	-38(%rsi), %r9
	mov	-30(%rsi), %r10
	mov	-22(%rsi), %r11
	mov	-14(%rsi), %rcx
	mov	-8(%rsi), %rdx
	movdqu	%xmm0, -62(%rdi)
	mov	%r8, -46(%rdi)
	mov	%r9, -38(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r11, -22(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2587
/* Copy the 54 bytes ending at %rsi to the 54 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -14 by 2 bytes.
   Clobbers %xmm0, %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_54bytes):
	movdqu	-54(%rsi), %xmm0
	mov	-38(%rsi), %r9
	mov	-30(%rsi), %r10
	mov	-22(%rsi), %r11
	mov	-14(%rsi), %rcx
	mov	-8(%rsi), %rdx
	movdqu	%xmm0, -54(%rdi)
	mov	%r9, -38(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r11, -22(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2603
/* Copy the 46 bytes ending at %rsi to the 46 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -14 by 2 bytes.
   Clobbers %r8-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_46bytes):
	mov	-46(%rsi), %r8
	mov	-38(%rsi), %r9
	mov	-30(%rsi), %r10
	mov	-22(%rsi), %r11
	mov	-14(%rsi), %rcx
	mov	-8(%rsi), %rdx
	mov	%r8, -46(%rdi)
	mov	%r9, -38(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r11, -22(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2619
/* Copy the 38 bytes ending at %rsi to the 38 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -14 by 2 bytes.
   Clobbers %r9-%r11, %rcx, %rdx.  */
	.p2align 4
L(write_38bytes):
	mov	-38(%rsi), %r9
	mov	-30(%rsi), %r10
	mov	-22(%rsi), %r11
	mov	-14(%rsi), %rcx
	mov	-8(%rsi), %rdx
	mov	%r9, -38(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r11, -22(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2633
/* Copy the 30 bytes ending at %rsi to the 30 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -14 by 2 bytes.
   Clobbers %r10, %r11, %rcx, %rdx.  */
	.p2align 4
L(write_30bytes):
	mov	-30(%rsi), %r10
	mov	-22(%rsi), %r11
	mov	-14(%rsi), %rcx
	mov	-8(%rsi), %rdx
	mov	%r10, -30(%rdi)
	mov	%r11, -22(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2645
/* Copy the 22 bytes ending at %rsi to the 22 bytes ending at %rdi.
   All loads are issued before the first store; the final 8-byte move
   overlaps the 8-byte move at -14 by 2 bytes.
   Clobbers %r11, %rcx, %rdx.  */
	.p2align 4
L(write_22bytes):
	mov	-22(%rsi), %r11
	mov	-14(%rsi), %rcx
	mov	-8(%rsi), %rdx
	mov	%r11, -22(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2655
/* Copy the 14 bytes ending at %rsi to the 14 bytes ending at %rdi using
   two 8-byte moves that overlap by 2 bytes.
   Both loads are issued before the first store.  Clobbers %rcx, %rdx.  */
	.p2align 4
L(write_14bytes):
	mov	-14(%rsi), %rcx
	mov	-8(%rsi), %rdx
	mov	%rcx, -14(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2663
/* Copy the 6 bytes ending at %rsi to the 6 bytes ending at %rdi using
   two 4-byte moves that overlap by 2 bytes.
   Both loads are issued before the first store.  Clobbers %rdx, %rcx.  */
	.p2align 4
L(write_6bytes):
	mov	-6(%rsi), %edx
	mov	-4(%rsi), %ecx
	mov	%edx, -6(%rdi)
	mov	%ecx, -4(%rdi)
	ret
2671
2672 .p2align 4
/* Tail copy of 79 bytes: the bytes at offsets [-79, 0) from %rsi are
   moved to the same offsets from %rdi with two unaligned 16-byte
   moves followed by six quadword moves.  All loads are issued before
   the first store.  The quadwords at offsets -15 and -8 overlap by
   one byte.  %rax is not modified.  */
L(write_79bytes):
	movdqu	-79(%rsi), %xmm0
	movdqu	-63(%rsi), %xmm1
	movq	-47(%rsi), %r8
	movq	-39(%rsi), %r9
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx

	movdqu	%xmm0, -79(%rdi)
	movdqu	%xmm1, -63(%rdi)
	movq	%r8, -47(%rdi)
	movq	%r9, -39(%rdi)
	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2691
2692 .p2align 4
/* Tail copy of 71 bytes ending at %rsi to the 71 bytes ending at
   %rdi: two unaligned 16-byte moves plus five quadword moves.  All
   loads precede all stores; the quadwords at offsets -15 and -8
   overlap by one byte.  */
L(write_71bytes):
	movdqu	-71(%rsi), %xmm0
	movdqu	-55(%rsi), %xmm1
	movq	-39(%rsi), %r9
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx

	movdqu	%xmm0, -71(%rdi)
	movdqu	%xmm1, -55(%rdi)
	movq	%r9, -39(%rdi)
	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2709
2710 .p2align 4
/* Tail copy of 63 bytes ending at %rsi to the 63 bytes ending at
   %rdi: one unaligned 16-byte move plus six quadword moves.  All
   loads precede all stores; the quadwords at offsets -15 and -8
   overlap by one byte.  */
L(write_63bytes):
	movdqu	-63(%rsi), %xmm0
	movq	-47(%rsi), %r8
	movq	-39(%rsi), %r9
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx

	movdqu	%xmm0, -63(%rdi)
	movq	%r8, -47(%rdi)
	movq	%r9, -39(%rdi)
	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2727
2728 .p2align 4
/* Tail copy of 55 bytes ending at %rsi to the 55 bytes ending at
   %rdi: one unaligned 16-byte move plus five quadword moves.  All
   loads precede all stores; the quadwords at offsets -15 and -8
   overlap by one byte.  */
L(write_55bytes):
	movdqu	-55(%rsi), %xmm0
	movq	-39(%rsi), %r9
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx

	movdqu	%xmm0, -55(%rdi)
	movq	%r9, -39(%rdi)
	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2743
2744 .p2align 4
/* Tail copy of 47 bytes ending at %rsi to the 47 bytes ending at
   %rdi, using six quadword moves.  All loads precede all stores; the
   quadwords at offsets -15 and -8 overlap by one byte.  */
L(write_47bytes):
	movq	-47(%rsi), %r8
	movq	-39(%rsi), %r9
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx

	movq	%r8, -47(%rdi)
	movq	%r9, -39(%rdi)
	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2759
2760 .p2align 4
/* Tail copy of 39 bytes ending at %rsi to the 39 bytes ending at
   %rdi, using five quadword moves.  All loads precede all stores; the
   quadwords at offsets -15 and -8 overlap by one byte.  */
L(write_39bytes):
	movq	-39(%rsi), %r9
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx

	movq	%r9, -39(%rdi)
	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2773
2774 .p2align 4
/* Tail copy of 31 bytes ending at %rsi to the 31 bytes ending at
   %rdi, using four quadword moves.  All loads precede all stores; the
   quadwords at offsets -15 and -8 overlap by one byte.  */
L(write_31bytes):
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx

	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2785
2786 .p2align 4
/* Tail copy of 23 bytes ending at %rsi to the 23 bytes ending at
   %rdi, using three quadword moves.  All loads precede all stores;
   the quadwords at offsets -15 and -8 overlap by one byte.  */
L(write_23bytes):
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx

	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2795
2796 .p2align 4
/* Tail copy of 15 bytes ending at %rsi to the 15 bytes ending at
   %rdi: two quadword moves that overlap by one byte.  Both loads
   precede both stores.  */
L(write_15bytes):
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx

	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2803
2804 .p2align 4
/* Tail copy of 7 bytes ending at %rsi to the 7 bytes ending at %rdi:
   two 32-bit moves (offsets -7 and -4) that overlap by one byte.
   Both loads precede both stores.  */
L(write_7bytes):
	movl	-7(%rsi), %edx
	movl	-4(%rsi), %ecx

	movl	%edx, -7(%rdi)
	movl	%ecx, -4(%rdi)
	ret
2811
2812 .p2align 4
/* Forward bulk copy using non-temporal stores: 128 bytes per loop
   iteration, then an optional 64-byte step, then a jump-table
   dispatch for the bytes that are left.
   On entry %rsi is the source cursor, %rdi the destination cursor
   (movntdq needs it 16-byte aligned) and %rdx the byte count.
   %xmm0 and %r8 hold a 16-byte store prepared before this label;
   they are set up outside this block -- presumably the head of the
   buffer, confirm at the jump site.  */
L(large_page_fwd):
	movdqu	(%rsi), %xmm1
	leaq	16(%rsi), %rsi
	movdqu	%xmm0, (%r8)		/* Emit the pending 16 bytes.  */
	movntdq	%xmm1, (%rdi)
	leaq	16(%rdi), %rdi
	/* Keep the count biased by -0x80 so that the borrow from the
	   loop's sub $0x80 ends the loop.  The additional 0x10
	   presumably accounts for the 16 bytes moved just above.  */
	leaq	-0x90(%rdx), %rdx
#ifdef USE_AS_MEMMOVE
	/* %r9 = source - destination.  When that distance is at least
	   the biased count (unsigned), use the non-temporal loop.  */
	movq	%rsi, %r9
	subq	%rdi, %r9
	cmpq	%rdx, %r9
	jae	L(memmove_is_memcpy_fwd)
	/* Otherwise take the cached-store loop when the biased count is
	   below 4 * %rcx.  %rcx is loaded before this block; presumably
	   a cache-size threshold -- confirm against the caller.  */
	shlq	$2, %rcx
	cmpq	%rcx, %rdx
	jb	L(ll_cache_copy_fwd_start)
L(memmove_is_memcpy_fwd):
#endif
L(large_page_loop):
	/* Load 128 source bytes.  */
	movdqu	(%rsi), %xmm0
	movdqu	0x10(%rsi), %xmm1
	movdqu	0x20(%rsi), %xmm2
	movdqu	0x30(%rsi), %xmm3
	movdqu	0x40(%rsi), %xmm4
	movdqu	0x50(%rsi), %xmm5
	movdqu	0x60(%rsi), %xmm6
	movdqu	0x70(%rsi), %xmm7
	leaq	0x80(%rsi), %rsi

	/* The flags produced here are consumed by the jae below; the
	   movntdq and lea instructions in between do not alter them.  */
	subq	$0x80, %rdx
	movntdq	%xmm0, (%rdi)
	movntdq	%xmm1, 0x10(%rdi)
	movntdq	%xmm2, 0x20(%rdi)
	movntdq	%xmm3, 0x30(%rdi)
	movntdq	%xmm4, 0x40(%rdi)
	movntdq	%xmm5, 0x50(%rdi)
	movntdq	%xmm6, 0x60(%rdi)
	movntdq	%xmm7, 0x70(%rdi)
	leaq	0x80(%rdi), %rdi
	jae	L(large_page_loop)
	/* Test the still-biased count, then remove the bias (lea keeps
	   the flags): skip the 64-byte step if fewer than 0x40 bytes
	   are left.  */
	cmpq	$-0x40, %rdx
	leaq	0x80(%rdx), %rdx
	jl	L(large_page_less_64bytes)

	/* One more 64-byte step.  */
	movdqu	(%rsi), %xmm0
	movdqu	0x10(%rsi), %xmm1
	movdqu	0x20(%rsi), %xmm2
	movdqu	0x30(%rsi), %xmm3
	leaq	0x40(%rsi), %rsi

	movntdq	%xmm0, (%rdi)
	movntdq	%xmm1, 0x10(%rdi)
	movntdq	%xmm2, 0x20(%rdi)
	movntdq	%xmm3, 0x30(%rdi)
	leaq	0x40(%rdi), %rdi
	subq	$0x40, %rdx
L(large_page_less_64bytes):
	/* Move both cursors past the remaining %rdx bytes; the write_N
	   routines address their data with negative offsets.  */
	addq	%rdx, %rsi
	addq	%rdx, %rdi
	/* Order the non-temporal stores before the tail copy.  */
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2873
#ifdef USE_AS_MEMMOVE
	.p2align 4
/* Forward bulk copy with ordinary aligned stores (movaps) and software
   prefetch; built only for the memmove variant.  Same structure as
   the non-temporal forward loop: 128 bytes per iteration with %rdx
   biased by -0x80, an optional 64-byte step, then a jump-table
   dispatch for the remaining bytes.  movaps requires %rdi to be
   16-byte aligned.  No sfence is issued on this path.  */
L(ll_cache_copy_fwd_start):
	/* Prefetch source data 0x1c0 and 0x200 bytes ahead.  */
	prefetcht0 0x1c0(%rsi)
	prefetcht0 0x200(%rsi)
	movdqu	(%rsi), %xmm0
	movdqu	0x10(%rsi), %xmm1
	movdqu	0x20(%rsi), %xmm2
	movdqu	0x30(%rsi), %xmm3
	movdqu	0x40(%rsi), %xmm4
	movdqu	0x50(%rsi), %xmm5
	movdqu	0x60(%rsi), %xmm6
	movdqu	0x70(%rsi), %xmm7
	leaq	0x80(%rsi), %rsi

	/* The flags produced here are consumed by the jae below; the
	   movaps and lea instructions in between do not alter them.  */
	subq	$0x80, %rdx
	movaps	%xmm0, (%rdi)
	movaps	%xmm1, 0x10(%rdi)
	movaps	%xmm2, 0x20(%rdi)
	movaps	%xmm3, 0x30(%rdi)
	movaps	%xmm4, 0x40(%rdi)
	movaps	%xmm5, 0x50(%rdi)
	movaps	%xmm6, 0x60(%rdi)
	movaps	%xmm7, 0x70(%rdi)
	leaq	0x80(%rdi), %rdi
	jae	L(ll_cache_copy_fwd_start)
	/* Test the still-biased count, then remove the bias (lea keeps
	   the flags): skip the 64-byte step if fewer than 0x40 bytes
	   are left.  */
	cmpq	$-0x40, %rdx
	leaq	0x80(%rdx), %rdx
	jl	L(large_page_ll_less_fwd_64bytes)

	/* One more 64-byte step.  */
	movdqu	(%rsi), %xmm0
	movdqu	0x10(%rsi), %xmm1
	movdqu	0x20(%rsi), %xmm2
	movdqu	0x30(%rsi), %xmm3
	leaq	0x40(%rsi), %rsi

	movaps	%xmm0, (%rdi)
	movaps	%xmm1, 0x10(%rdi)
	movaps	%xmm2, 0x20(%rdi)
	movaps	%xmm3, 0x30(%rdi)
	leaq	0x40(%rdi), %rdi
	subq	$0x40, %rdx
L(large_page_ll_less_fwd_64bytes):
	/* Move both cursors past the remaining %rdx bytes; the write_N
	   routines address their data with negative offsets.  */
	addq	%rdx, %rsi
	addq	%rdx, %rdi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)

#endif
2922 .p2align 4
/* Backward bulk copy using non-temporal stores: 128 bytes per loop
   iteration moving towards lower addresses, then an optional 64-byte
   step, then a jump-table dispatch for the bytes that are left.
   On entry %rsi and %rdi point just past the data still to be moved
   and %rdx is the byte count.  %xmm0 and %r8 hold a 16-byte store
   prepared before this label; they are set up outside this block --
   presumably one end of the buffer, confirm at the jump site.  */
L(large_page_bwd):
	movdqu	-0x10(%rsi), %xmm1
	leaq	-16(%rsi), %rsi
	movdqu	%xmm0, (%r8)		/* Emit the pending 16 bytes.  */
	/* Aligned (not non-temporal) store; needs -0x10(%rdi) to be
	   16-byte aligned.  */
	movdqa	%xmm1, -0x10(%rdi)
	leaq	-16(%rdi), %rdi
	/* Keep the count biased by -0x80 so that the borrow from the
	   loop's sub $0x80 ends the loop.  The additional 0x10
	   presumably accounts for the 16 bytes moved just above.  */
	leaq	-0x90(%rdx), %rdx
#ifdef USE_AS_MEMMOVE
	/* %r9 = destination - source.  When that distance is at least
	   the biased count (unsigned), use the non-temporal loop.  */
	movq	%rdi, %r9
	subq	%rsi, %r9
	cmpq	%rdx, %r9
	jae	L(memmove_is_memcpy_bwd)
	/* Otherwise take the cached-store loop when the distance is
	   below %rcx.  %rcx is loaded before this block; presumably a
	   cache-size threshold -- confirm against the caller.  Note
	   that this test differs from the forward path, which compares
	   the count against 4 * %rcx.  */
	cmpq	%rcx, %r9
	jb	L(ll_cache_copy_bwd_start)
L(memmove_is_memcpy_bwd):
#endif
L(large_page_bwd_loop):
	/* Load the 128 bytes immediately below the source cursor.  */
	movdqu	-0x10(%rsi), %xmm0
	movdqu	-0x20(%rsi), %xmm1
	movdqu	-0x30(%rsi), %xmm2
	movdqu	-0x40(%rsi), %xmm3
	movdqu	-0x50(%rsi), %xmm4
	movdqu	-0x60(%rsi), %xmm5
	movdqu	-0x70(%rsi), %xmm6
	movdqu	-0x80(%rsi), %xmm7
	leaq	-0x80(%rsi), %rsi

	/* The flags produced here are consumed by the jae below; the
	   movntdq and lea instructions in between do not alter them.  */
	subq	$0x80, %rdx
	movntdq	%xmm0, -0x10(%rdi)
	movntdq	%xmm1, -0x20(%rdi)
	movntdq	%xmm2, -0x30(%rdi)
	movntdq	%xmm3, -0x40(%rdi)
	movntdq	%xmm4, -0x50(%rdi)
	movntdq	%xmm5, -0x60(%rdi)
	movntdq	%xmm6, -0x70(%rdi)
	movntdq	%xmm7, -0x80(%rdi)
	leaq	-0x80(%rdi), %rdi
	jae	L(large_page_bwd_loop)
	/* Test the still-biased count, then remove the bias (lea keeps
	   the flags): skip the 64-byte step if fewer than 0x40 bytes
	   are left.  */
	cmpq	$-0x40, %rdx
	leaq	0x80(%rdx), %rdx
	jl	L(large_page_less_bwd_64bytes)

	/* One more 64-byte step.  */
	movdqu	-0x10(%rsi), %xmm0
	movdqu	-0x20(%rsi), %xmm1
	movdqu	-0x30(%rsi), %xmm2
	movdqu	-0x40(%rsi), %xmm3
	leaq	-0x40(%rsi), %rsi

	movntdq	%xmm0, -0x10(%rdi)
	movntdq	%xmm1, -0x20(%rdi)
	movntdq	%xmm2, -0x30(%rdi)
	movntdq	%xmm3, -0x40(%rdi)
	leaq	-0x40(%rdi), %rdi
	subq	$0x40, %rdx
L(large_page_less_bwd_64bytes):
	/* The cursors already sit just past the remaining %rdx bytes,
	   so no pointer adjustment is made before the dispatch.  Order
	   the non-temporal stores first.  */
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2980
#ifdef USE_AS_MEMMOVE
	.p2align 4
/* Backward bulk copy with ordinary aligned stores (movaps) and
   software prefetch; built only for the memmove variant.  Same
   structure as the non-temporal backward loop: 128 bytes per
   iteration with %rdx biased by -0x80, an optional 64-byte step, then
   a jump-table dispatch for the remaining bytes.  movaps requires
   %rdi to be 16-byte aligned.  No sfence is issued on this path.  */
L(ll_cache_copy_bwd_start):
	/* Prefetch source data 0x1c0 and 0x200 bytes below the cursor.  */
	prefetcht0 -0x1c0(%rsi)
	prefetcht0 -0x200(%rsi)
	movdqu	-0x10(%rsi), %xmm0
	movdqu	-0x20(%rsi), %xmm1
	movdqu	-0x30(%rsi), %xmm2
	movdqu	-0x40(%rsi), %xmm3
	movdqu	-0x50(%rsi), %xmm4
	movdqu	-0x60(%rsi), %xmm5
	movdqu	-0x70(%rsi), %xmm6
	movdqu	-0x80(%rsi), %xmm7
	leaq	-0x80(%rsi), %rsi

	/* The flags produced here are consumed by the jae below; the
	   movaps and lea instructions in between do not alter them.  */
	subq	$0x80, %rdx
	movaps	%xmm0, -0x10(%rdi)
	movaps	%xmm1, -0x20(%rdi)
	movaps	%xmm2, -0x30(%rdi)
	movaps	%xmm3, -0x40(%rdi)
	movaps	%xmm4, -0x50(%rdi)
	movaps	%xmm5, -0x60(%rdi)
	movaps	%xmm6, -0x70(%rdi)
	movaps	%xmm7, -0x80(%rdi)
	leaq	-0x80(%rdi), %rdi
	jae	L(ll_cache_copy_bwd_start)
	/* Test the still-biased count, then remove the bias (lea keeps
	   the flags): skip the 64-byte step if fewer than 0x40 bytes
	   are left.  */
	cmpq	$-0x40, %rdx
	leaq	0x80(%rdx), %rdx
	jl	L(large_page_ll_less_bwd_64bytes)

	/* One more 64-byte step.  */
	movdqu	-0x10(%rsi), %xmm0
	movdqu	-0x20(%rsi), %xmm1
	movdqu	-0x30(%rsi), %xmm2
	movdqu	-0x40(%rsi), %xmm3
	leaq	-0x40(%rsi), %rsi

	movaps	%xmm0, -0x10(%rdi)
	movaps	%xmm1, -0x20(%rdi)
	movaps	%xmm2, -0x30(%rdi)
	movaps	%xmm3, -0x40(%rdi)
	leaq	-0x40(%rdi), %rdi
	subq	$0x40, %rdx
L(large_page_ll_less_bwd_64bytes):
	/* The cursors already sit just past the remaining %rdx bytes,
	   so dispatch directly.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
#endif
3026
3027END (MEMCPY)
3028
3029 .section .rodata.ssse3,"a",@progbits
3030 .p2align 3
/* Dispatch table for tail copies of 0 to 79 bytes.  Entry N is the
   32-bit offset of L(write_Nbytes) relative to the start of this
   table (see JMPTBL); BRANCH_TO_JMPTBL_ENTRY indexes it with a scale
   of 4 and adds the table address back before jumping.  */
L(table_less_80bytes):
	.long	JMPTBL (L(write_0bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_1bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_2bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_3bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_4bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_5bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_6bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_7bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_8bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_9bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_10bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_11bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_12bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_13bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_14bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_15bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_16bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_17bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_18bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_19bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_20bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_21bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_22bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_23bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_24bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_25bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_26bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_27bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_28bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_29bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_30bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_31bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_32bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_33bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_34bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_35bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_36bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_37bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_38bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_39bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_40bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_41bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_42bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_43bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_44bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_45bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_46bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_47bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_48bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_49bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_50bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_51bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_52bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_53bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_54bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_55bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_56bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_57bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_58bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_59bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_60bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_61bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_62bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_63bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_64bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_65bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_66bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_67bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_68bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_69bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_70bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_71bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_72bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_73bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_74bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_75bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_76bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_77bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_78bytes), L(table_less_80bytes))
	.long	JMPTBL (L(write_79bytes), L(table_less_80bytes))
3112
3113 .p2align 3
/* Jump table with sixteen entries.  Entry N is the 32-bit offset of
   L(shl_N) relative to the start of this table (see JMPTBL).  The
   L(shl_N) targets and the code that indexes this table lie outside
   this part of the file.  */
L(shl_table):
	.long	JMPTBL (L(shl_0), L(shl_table))
	.long	JMPTBL (L(shl_1), L(shl_table))
	.long	JMPTBL (L(shl_2), L(shl_table))
	.long	JMPTBL (L(shl_3), L(shl_table))
	.long	JMPTBL (L(shl_4), L(shl_table))
	.long	JMPTBL (L(shl_5), L(shl_table))
	.long	JMPTBL (L(shl_6), L(shl_table))
	.long	JMPTBL (L(shl_7), L(shl_table))
	.long	JMPTBL (L(shl_8), L(shl_table))
	.long	JMPTBL (L(shl_9), L(shl_table))
	.long	JMPTBL (L(shl_10), L(shl_table))
	.long	JMPTBL (L(shl_11), L(shl_table))
	.long	JMPTBL (L(shl_12), L(shl_table))
	.long	JMPTBL (L(shl_13), L(shl_table))
	.long	JMPTBL (L(shl_14), L(shl_table))
	.long	JMPTBL (L(shl_15), L(shl_table))
3131
3132 .p2align 3
/* Jump table with sixteen entries for the backward variants.  Entry N
   is the 32-bit offset of L(shl_N_bwd) relative to the start of this
   table (see JMPTBL).  The L(shl_N_bwd) targets and the code that
   indexes this table lie outside this part of the file.  */
L(shl_table_bwd):
	.long	JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
	.long	JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
3150
3151#endif
3152

/* Source: glibc/sysdeps/x86_64/multiarch/memcpy-ssse3.S  */