/* memset with unaligned store and rep stosb
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping store to avoid branch.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align the destination to
      2 * VEC_SIZE with 4 VEC stores and store 4 * VEC at a time until
      done.
   6. In the unaligned_erms variant, sizes above
      __x86_rep_stosb_threshold use rep stosb instead.  */

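/* Worked example of the dispatch (illustrative, assuming
   VEC_SIZE == 32): memset (dst, c, 96) falls under case 4.  Two VEC
   stores cover dst + 0 and dst + 32, and two more cover the final
   2 * VEC_SIZE bytes at dst + 96 - 64 and dst + 96 - 32; the bytes at
   dst + 32..63 are simply stored twice, which avoids any tail
   branching.  */
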
#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0 xmm0
#endif

#ifndef YMM0
# define YMM0 ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
#  define VZEROUPPER_SHORT_RETURN vzeroupper; ret
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# define VZEROUPPER_SHORT_RETURN rep; ret
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ vmovq
#  define MOVD vmovd
# else
#  define MOVQ movq
#  define MOVD movd
# endif
#endif

#if VEC_SIZE == 64
# define LOOP_4X_OFFSET (VEC_SIZE * 4)
#else
# define LOOP_4X_OFFSET (0)
#endif

#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
# define END_REG rcx
# define LOOP_REG rdi
# define LESS_VEC_REG rax
#else
# define END_REG rdi
# define LOOP_REG rdx
# define LESS_VEC_REG rdi
#endif

#ifdef USE_XMM_LESS_VEC
# define XMM_SMALL 1
#else
# define XMM_SMALL 0
#endif

#ifdef USE_LESS_VEC_MASK_STORE
# define SET_REG64 rcx
# define SET_REG32 ecx
# define SET_REG16 cx
# define SET_REG8 cl
#else
# define SET_REG64 rsi
# define SET_REG32 esi
# define SET_REG16 si
# define SET_REG8 sil
#endif

#define PAGE_SIZE 4096

/* Macro to calculate the size of a small memset block, used to bound
   the alignment padding for the small-size targets below.  */
#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1)

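/* Worked example (illustrative): SMALL_MEMSET_ALIGN (3, 1) == 2 * 3 +
   1 + 1 == 8, so the matching ".p2align 4,, 8" pads to a 16-byte
   boundary only when at most 8 bytes of padding are required.  */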

#ifndef SECTION
# error SECTION is not defined!
#endif

        .section SECTION(.text), "ax", @progbits
#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
        cmp     %RDX_LP, %RCX_LP
        jb      HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
        /* Convert the wchar_t count in rdx to a byte count
           (sizeof (wchar_t) == 4).  */
        shl     $2, %RDX_LP
        WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
        WMEMSET_VDUP_TO_VEC0_LOW()
        cmpq    $VEC_SIZE, %rdx
        jb      L(less_vec_from_wmemset)
        WMEMSET_VDUP_TO_VEC0_HIGH()
        jmp     L(entry_from_wmemset)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
        cmp     %RDX_LP, %RCX_LP
        jb      HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
        MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        mov     %edx, %edx
# endif
        cmpq    $VEC_SIZE, %rdx
        jb      L(less_vec)
        MEMSET_VDUP_TO_VEC0_HIGH()
L(entry_from_wmemset):
        cmpq    $(VEC_SIZE * 2), %rdx
        ja      L(more_2x_vec)
        /* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size ==
           VEC_SIZE.  */
        VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
        VMOVU   %VEC(0), (%rdi)
        VZEROUPPER_RETURN
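
/* Sketch of the overlapping-store trick above (illustrative, not part
   of the build): for VEC_SIZE <= n <= 2 * VEC_SIZE the two unaligned
   VEC stores cover every byte without a branch because they may
   overlap:

       store VEC_SIZE bytes at dst + n - VEC_SIZE;  // tail
       store VEC_SIZE bytes at dst;                 // head

   When n == VEC_SIZE both stores land on the same address.  */
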
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
        cmp     %RDX_LP, %RCX_LP
        jb      HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
        MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        mov     %edx, %edx
# endif
        cmp     $VEC_SIZE, %RDX_LP
        jb      L(less_vec)
        MEMSET_VDUP_TO_VEC0_HIGH ()
        cmp     $(VEC_SIZE * 2), %RDX_LP
        ja      L(stosb_more_2x_vec)
        /* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size ==
           VEC_SIZE.  */
        VMOVU   %VEC(0), (%rdi)
        VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
        VZEROUPPER_RETURN
#endif

        .p2align 4,, 4
L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
        /* In the mask-store (EVEX/AVX512) variants rdi still points at
           the start of the buffer, so index the end with rdx.  */
        VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
        VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
#else
        /* Otherwise rdx has already been added to rdi (END_REG), so
           rdi alone is the end pointer.  */
        VMOVU   %VEC(0), (VEC_SIZE * -2)(%rdi)
        VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi)
#endif
        VZEROUPPER_RETURN

/* If we have AVX512 mask instructions, put L(less_vec) close to the
   entry as it doesn't take much space and is likely a hot target.  */
#ifdef USE_LESS_VEC_MASK_STORE
        .p2align 4,, 10
L(less_vec):
L(less_vec_from_wmemset):
        /* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
        /* Clear high bits from edi.  Only keep the bits relevant to the
           page-cross check.  Note that from here on out we use rax,
           which MEMSET_SET_VEC0_AND_SET_RETURN set to the original
           destination, as the pointer.  */
        andl    $(PAGE_SIZE - 1), %edi
        /* Check if a VEC_SIZE store would cross a page boundary.  Mask
           stores suffer serious performance degradation when they have
           to suppress a fault.  */
        cmpl    $(PAGE_SIZE - VEC_SIZE), %edi
        /* This is generally considered a cold target.  */
        ja      L(cross_page)
# if VEC_SIZE > 32
        movq    $-1, %rcx
        bzhiq   %rdx, %rcx, %rcx
        kmovq   %rcx, %k1
# else
        movl    $-1, %ecx
        bzhil   %edx, %ecx, %ecx
        kmovd   %ecx, %k1
# endif
        vmovdqu8 %VEC(0), (%rax){%k1}
        VZEROUPPER_RETURN
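
/* Example of the mask computation above (illustrative): bzhi copies
   its source with all bits at positions >= n (taken from the low
   8 bits of rdx) cleared.  Starting from -1, n == 5 yields
   (1 << 5) - 1 == 0x1f in k1, so the masked vmovdqu8 writes exactly
   5 bytes and the rest of the VEC store is suppressed.  */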

# if defined USE_MULTIARCH && IS_IN (libc)
        /* Include L(stosb_local) here if L(less_vec) was placed between
           L(stosb_more_2x_vec) and ENTRY.  This keeps the
           L(stosb_more_2x_vec) target cache-line aligned.  */
        .p2align 4,, 10
L(stosb_local):
        /* rep stosb takes the byte in al, the count in rcx and the
           destination in rdi; rdi is clobbered, so preserve the return
           value in rdx.  */
        movzbl  %sil, %eax
        mov     %RDX_LP, %RCX_LP
        mov     %RDI_LP, %RDX_LP
        rep     stosb
        mov     %RDX_LP, %RAX_LP
        VZEROUPPER_RETURN
# endif
#endif

#if defined USE_MULTIARCH && IS_IN (libc)
        .p2align 4
L(stosb_more_2x_vec):
        cmp     __x86_rep_stosb_threshold(%rip), %RDX_LP
        ja      L(stosb_local)
#endif
        /* Fallthrough goes to the 4x VEC loop.  memsets in
           (2 * VEC_SIZE, 4 * VEC_SIZE] jump to L(last_2x_vec) and
           those in (4 * VEC_SIZE, 8 * VEC_SIZE] to L(last_4x_vec).  */
L(more_2x_vec):
        /* Store next 2x vec regardless.  */
        VMOVU   %VEC(0), (%rdi)
        VMOVU   %VEC(0), (VEC_SIZE * 1)(%rdi)


        /* Two different methods of setting up pointers / compare.  The
           two methods are based on the fact that EVEX/AVX512 mov
           instructions take more bytes than AVX2/SSE2 mov instructions,
           and that EVEX/AVX512 machines also have fast LEA_BID (lea
           with base, index and displacement).  Both methods set up
           END_REG so later stores avoid a complex address mode.  For
           EVEX/AVX512 this saves code size and keeps a few targets in
           one fetch block.  For AVX2/SSE2 this helps prevent AGU
           bottlenecks.  */
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
        /* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
        addq    %rdx, %END_REG
#endif
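
/* Resulting end-pointer setup (illustrative summary):
     AVX2/SSE2:   END_REG (rdi) = rdi + rdx via ALU add
     EVEX/AVX512: END_REG (rcx) via the lea below of
                  -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx)
   so subsequent stores use simple base + displacement addressing.  */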

        cmpq    $(VEC_SIZE * 4), %rdx
        jbe     L(last_2x_vec)


#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
        /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
           LOOP_4X_OFFSET) with LEA_BID.  */

        /* END_REG is rcx for EVEX/AVX512.  */
        leaq    -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
#endif

        /* Store next 2x vec regardless.  */
        VMOVU   %VEC(0), (VEC_SIZE * 2)(%rax)
        VMOVU   %VEC(0), (VEC_SIZE * 3)(%rax)


#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
        /* If LOOP_4X_OFFSET is nonzero, don't readjust LOOP_REG (rdi);
           just add the extra offset to the addresses in the loop.  Used
           for AVX512 to save space, as there is no way to encode
           (VEC_SIZE * 4) in an imm8.  */
# if LOOP_4X_OFFSET == 0
        subq    $-(VEC_SIZE * 4), %LOOP_REG
# endif
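        /* Why subq of a negative rather than addq (illustrative): with
           VEC_SIZE == 32, "addq $128" needs a sign-extended imm32 while
           "subq $-128" fits in an imm8, saving 3 bytes.  With
           VEC_SIZE == 64 neither +256 nor -256 fits in an imm8, hence
           LOOP_4X_OFFSET folds the offset into the addressing mode
           instead.  */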
        /* Avoid imm32 compare here to save code size.  */
        cmpq    %rdi, %rcx
#else
        addq    $-(VEC_SIZE * 4), %END_REG
        cmpq    $(VEC_SIZE * 8), %rdx
#endif
        jbe     L(last_4x_vec)
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
        /* Set LOOP_REG (rdx).  */
        leaq    (VEC_SIZE * 4)(%rax), %LOOP_REG
#endif
        /* Align dst to 2 * VEC_SIZE for the loop.  */
        andq    $(VEC_SIZE * -2), %LOOP_REG
        .p2align 4
L(loop):
        VMOVA   %VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
        VMOVA   %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
        VMOVA   %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
        VMOVA   %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
        subq    $-(VEC_SIZE * 4), %LOOP_REG
        cmpq    %END_REG, %LOOP_REG
        jb      L(loop)
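
/* Shape of the loop above (illustrative pseudocode, not part of the
   build):

       loop_reg &= -(2 * VEC_SIZE);              // align down
       do {
           store 4 aligned VECs at loop_reg + LOOP_4X_OFFSET;
           loop_reg += 4 * VEC_SIZE;             // subq $-(VEC_SIZE * 4)
       } while (loop_reg < end_reg);
       store 4 unaligned VECs ending at dst + n; // L(last_4x_vec)
*/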
        .p2align 4,, MOV_SIZE
L(last_4x_vec):
        VMOVU   %VEC(0), LOOP_4X_OFFSET(%END_REG)
        VMOVU   %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
        VMOVU   %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
        VMOVU   %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
L(return):
#if VEC_SIZE > 16
        ZERO_UPPER_VEC_REGISTERS_RETURN
#else
        ret
#endif

        .p2align 4,, 10
#ifndef USE_LESS_VEC_MASK_STORE
# if defined USE_MULTIARCH && IS_IN (libc)
        /* If USE_LESS_VEC_MASK_STORE is not defined, put L(stosb_local)
           here so it is in range for a 2-byte jump encoding.  */
L(stosb_local):
        movzbl  %sil, %eax
        mov     %RDX_LP, %RCX_LP
        mov     %RDI_LP, %RDX_LP
        rep     stosb
        mov     %RDX_LP, %RAX_LP
        VZEROUPPER_RETURN
# endif
        /* Define L(less_vec) only if not otherwise defined.  */
        .p2align 4
L(less_vec):
        /* Broadcast esi to a partial register (i.e. for VEC_SIZE == 32
           broadcast to xmm).  This only does anything for AVX2.  */
        MEMSET_VDUP_TO_VEC0_LOW ()
L(less_vec_from_wmemset):
#endif
L(cross_page):
#if VEC_SIZE > 32
        cmpl    $32, %edx
        jge     L(between_32_63)
#endif
#if VEC_SIZE > 16
        cmpl    $16, %edx
        jge     L(between_16_31)
#endif
#ifndef USE_XMM_LESS_VEC
        /* Copy the memset value from XMM0 into a GPR for the small
           integer stores below.  */
        MOVQ    %XMM0, %SET_REG64
#endif
        cmpl    $8, %edx
        jge     L(between_8_15)
        cmpl    $4, %edx
        jge     L(between_4_7)
        cmpl    $1, %edx
        jg      L(between_2_3)
        jl      L(between_0_0)
        /* Size == 1 falls through to a single byte store.  */
        movb    %SET_REG8, (%LESS_VEC_REG)
L(between_0_0):
        ret

/* Align small targets only if not doing so would cross a fetch
   line.  */
#if VEC_SIZE > 32
        .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
        /* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
        VMOVU   %YMM0, (%LESS_VEC_REG)
        VMOVU   %YMM0, -32(%LESS_VEC_REG, %rdx)
        VZEROUPPER_RETURN
#endif

#if VEC_SIZE >= 32
        .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
L(between_16_31):
        /* From 16 to 31.  No branch when size == 16.  */
        VMOVU   %XMM0, (%LESS_VEC_REG)
        VMOVU   %XMM0, -16(%LESS_VEC_REG, %rdx)
        ret
#endif

/* Move size is 3 for SSE2, EVEX, and AVX512.  Move size is 4 for
   AVX2.  */
        .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
L(between_8_15):
        /* From 8 to 15.  No branch when size == 8.  */
#ifdef USE_XMM_LESS_VEC
        MOVQ    %XMM0, (%rdi)
        MOVQ    %XMM0, -8(%rdi, %rdx)
#else
        movq    %SET_REG64, (%LESS_VEC_REG)
        movq    %SET_REG64, -8(%LESS_VEC_REG, %rdx)
#endif
        ret

/* Move size is 2 for SSE2, EVEX, and AVX512.  Move size is 4 for
   AVX2.  */
        .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
L(between_4_7):
        /* From 4 to 7.  No branch when size == 4.  */
#ifdef USE_XMM_LESS_VEC
        MOVD    %XMM0, (%rdi)
        MOVD    %XMM0, -4(%rdi, %rdx)
#else
        movl    %SET_REG32, (%LESS_VEC_REG)
        movl    %SET_REG32, -4(%LESS_VEC_REG, %rdx)
#endif
        ret

/* 4 * XMM_SMALL for the third mov for AVX2.  */
        .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
L(between_2_3):
        /* From 2 to 3.  No branch when size == 2.  */
#ifdef USE_XMM_LESS_VEC
        movb    %SET_REG8, (%rdi)
        movb    %SET_REG8, 1(%rdi)
        movb    %SET_REG8, -1(%rdi, %rdx)
#else
        movw    %SET_REG16, (%LESS_VEC_REG)
        movb    %SET_REG8, -1(%LESS_VEC_REG, %rdx)
#endif
        ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))
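
        /* Example for the movw/movb variant (illustrative): with
           n == 3 the movw covers bytes 0-1 and the movb covers byte 2;
           with n == 2 the movb overlaps byte 1, so both sizes are
           handled branchlessly.  */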