/* Copyright (C) 2002-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <dl-sysdep.h>
#include <dl-tls.h>
#include <tls.h>
#include <list.h>
#include <lowlevellock.h>
#include <futex-internal.h>
#include <kernel-features.h>
#include <nptl-stack.h>
#include <libc-lock.h>
#include <tls-internal.h>
#include <intprops.h>
#include <setvmaname.h>

/* Default alignment of stack.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK 4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* Get a stack from the cache.  We have to match by size since
   some blocks might be too small or far too large.  */
static struct pthread *
get_cached_stack (size_t *sizep, void **memp)
{
  size_t size = *sizep;
  struct pthread *result = NULL;
  list_t *entry;

  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

  /* Search the cache for a matching entry.  We search for the
     smallest stack which has at least the required size.  Note that
     in normal situations the size of all allocated stacks is the
     same.  At the very least there are only a few different sizes.
     Therefore this loop will exit early most of the time with an
     exact match.  */
  list_for_each (entry, &GL (dl_stack_cache))
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (__nptl_stack_in_use (curr) && curr->stackblock_size >= size)
        {
          if (curr->stackblock_size == size)
            {
              result = curr;
              break;
            }

          if (result == NULL
              || result->stackblock_size > curr->stackblock_size)
            result = curr;
        }
    }

  if (__builtin_expect (result == NULL, 0)
      /* Make sure the size difference is not excessive.  In that
         case we do not use the block.  */
      || __builtin_expect (result->stackblock_size > 4 * size, 0))
    {
      /* Release the lock.  */
      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);

      return NULL;
    }

  /* Don't allow setxid until cloned.  */
  result->setxid_futex = -1;

  /* Dequeue the entry.  */
  __nptl_stack_list_del (&result->list);

  /* And add to the list of stacks in use.  */
  __nptl_stack_list_add (&result->list, &GL (dl_stack_used));

  /* And decrease the cache size.  */
  GL (dl_stack_cache_actsize) -= result->stackblock_size;

  /* Release the lock early.  */
  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);

  /* Report size and location of the stack to the caller.  */
  *sizep = result->stackblock_size;
  *memp = result->stackblock;

  /* Cancellation handling is back to the default.  */
  result->cancelhandling = 0;
  result->cleanup = NULL;
  result->setup_failed = 0;

  /* No pending event.  */
  result->nextevent = NULL;

  result->exiting = false;
  __libc_lock_init (result->exit_lock);
  memset (&result->tls_state, 0, sizeof result->tls_state);

  /* Clear the DTV.  */
  dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
  for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
    free (dtv[1 + cnt].pointer.to_free);
  memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));

  /* Re-initialize the TLS.  */
  _dl_allocate_tls_init (TLS_TPADJ (result), true);

  return result;
}
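
/* In other words, get_cached_stack implements a best-fit policy with a
   hard cap: an exact size match wins immediately, otherwise the
   smallest cached block of at least the requested size is reused, and
   a block more than four times the requested size is never handed out
   (it stays in the cache for a better-sized request).  */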

/* Return the guard page position on allocated stack.  */
static inline char *
__attribute ((always_inline))
guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
                size_t pagesize_m1)
{
#if _STACK_GROWS_DOWN
  return mem;
#elif _STACK_GROWS_UP
  return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
}
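
/* An illustrative sketch of the resulting layout (the exact placement
   of the TLS block relative to the descriptor depends on TLS_TCB_AT_TP
   vs. TLS_DTV_AT_TP):

     _STACK_GROWS_DOWN:
     mem                                               mem + size
     +-------+------------------------------+----------------------+
     | guard |  <- stack grows down         | TLS / struct pthread |
     +-------+------------------------------+----------------------+

     _STACK_GROWS_UP:
     mem                                               mem + size
     +------------------------------+-------+----------------------+
     | stack grows up ->            | guard | struct pthread / TLS |
     +------------------------------+-------+----------------------+  */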

/* Based on the stack allocated with PROT_NONE, set up the required
   portions with 'prot' flags based on the guard page position.  */
static inline int
setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
                  const int prot)
{
  char *guardend = guard + guardsize;
#if _STACK_GROWS_DOWN
  /* As defined at guard_position, for architectures with a
     downward-growing stack the guard page is always at the start of
     the allocated area.  */
  if (__mprotect (guardend, size - guardsize, prot) != 0)
    return errno;
#else
  size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
  if (__mprotect (mem, mprots1, prot) != 0)
    return errno;
  size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
  if (__mprotect (guardend, mprots2, prot) != 0)
    return errno;
#endif
  return 0;
}
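
/* For the upward-growing case the two calls above re-enable access on
   the regions on either side of the guard (a sketch; MEM, GUARD and
   GUARDEND are page-aligned, and the kernel rounds the mprotect
   lengths up to whole pages):

     mem             guard                  guardend       mem + size
     [--- mprots1 ---][- guard, PROT_NONE -][---- mprots2 ----]

   For the downward-growing case a single call suffices because the
   guard is known to start at MEM.  */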

/* Advise the kernel that the unused part of the stack may be
   reclaimed.  Everything is freed except for the frames currently in
   use and the space used for the TCB itself.  */
static __always_inline void
advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
{
  uintptr_t sp = (uintptr_t) CURRENT_STACK_FRAME;
  size_t pagesize_m1 = __getpagesize () - 1;
#if _STACK_GROWS_DOWN
  size_t freesize = (sp - (uintptr_t) mem) & ~pagesize_m1;
  assert (freesize < size);
  if (freesize > PTHREAD_STACK_MIN)
    __madvise (mem, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
#else
  /* Page aligned start of memory to free (higher than or equal
     to current sp plus the minimum stack size).  */
  uintptr_t freeblock = (sp + PTHREAD_STACK_MIN + pagesize_m1) & ~pagesize_m1;
  uintptr_t free_end = (pd - guardsize) & ~pagesize_m1;
  if (free_end > freeblock)
    {
      size_t freesize = free_end - freeblock;
      assert (freesize < size);
      __madvise ((void *) freeblock, freesize, MADV_DONTNEED);
    }
#endif
}
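
/* For example, in the downward-growing case with SP being the caller's
   stack pointer, the pages from MEM up to roughly SP minus the
   PTHREAD_STACK_MIN reserve are advised away (an illustrative sketch;
   FREESIZE is rounded down to a page boundary first):

     mem                                  sp          mem + size
     [ ....... MADV_DONTNEED ....... |reserve| live frames | pd ]  */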

/* Returns a usable stack for a new thread either by allocating a
   new stack or reusing a cached stack of sufficient size.
   ATTR must be non-NULL and point to a valid pthread_attr.
   PDP must be non-NULL.  */
static int
allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
                void **stack, size_t *stacksize)
{
  struct pthread *pd;
  size_t size;
  size_t pagesize_m1 = __getpagesize () - 1;
  size_t tls_static_size_for_stack = __nptl_tls_static_size_for_stack ();
  size_t tls_static_align_m1 = GLRO (dl_tls_static_align) - 1;

  assert (powerof2 (pagesize_m1 + 1));
  assert (TCB_ALIGNMENT >= STACK_ALIGN);

  /* Get the stack size from the attribute if it is set.  Otherwise we
     use the default we determined at start time.  */
  if (attr->stacksize != 0)
    size = attr->stacksize;
  else
    {
      lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
      size = __default_pthread_attr.internal.stacksize;
      lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
    }

  /* Get memory for the stack.  */
  if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
    {
      uintptr_t adj;
      char *stackaddr = (char *) attr->stackaddr;

      /* Assume the same layout as the _STACK_GROWS_DOWN case, with struct
         pthread at the top of the stack block.  Later we adjust the guard
         location and stack address to match the _STACK_GROWS_UP case.  */
      if (_STACK_GROWS_UP)
        stackaddr += attr->stacksize;

      /* If the user also specified the size of the stack make sure it
         is large enough.  */
      if (attr->stacksize != 0
          && attr->stacksize < (tls_static_size_for_stack
                                + MINIMAL_REST_STACK))
        return EINVAL;

      /* Adjust stack size for alignment of the TLS block.  */
#if TLS_TCB_AT_TP
      adj = ((uintptr_t) stackaddr - TLS_TCB_SIZE)
            & tls_static_align_m1;
      assert (size > adj + TLS_TCB_SIZE);
#elif TLS_DTV_AT_TP
      adj = ((uintptr_t) stackaddr - tls_static_size_for_stack)
            & tls_static_align_m1;
      assert (size > adj);
#endif

      /* The user provided some memory.  Let's hope it matches the
         size...  We do not allocate guard pages if the user provided
         the stack.  It is the user's responsibility to do this if it
         is wanted.  */
#if TLS_TCB_AT_TP
      pd = (struct pthread *) ((uintptr_t) stackaddr
                               - TLS_TCB_SIZE - adj);
#elif TLS_DTV_AT_TP
      pd = (struct pthread *) (((uintptr_t) stackaddr
                                - tls_static_size_for_stack - adj)
                               - TLS_PRE_TCB_SIZE);
#endif

      /* The user-provided stack memory needs to be cleared.  */
      memset (pd, '\0', sizeof (struct pthread));

      /* The first TSD block is included in the TCB.  */
      pd->specific[0] = pd->specific_1stblock;

      /* Remember the stack-related values.  */
      pd->stackblock = (char *) stackaddr - size;
      pd->stackblock_size = size;

      /* This is a user-provided stack.  It will not be queued in the
         stack cache nor will the memory (except the TLS memory) be freed.  */
      pd->user_stack = true;

      /* This is at least the second thread.  */
      pd->header.multiple_threads = 1;

#ifdef NEED_DL_SYSINFO
      SETUP_THREAD_SYSINFO (pd);
#endif

      /* Don't allow setxid until cloned.  */
      pd->setxid_futex = -1;

      /* Allocate the DTV for this thread.  */
      if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
        {
          /* Something went wrong.  */
          assert (errno == ENOMEM);
          return errno;
        }

      /* Prepare to modify global data.  */
      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

      /* And add to the list of stacks in use.  */
      list_add (&pd->list, &GL (dl_stack_user));

      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
    }
  else
    {
      /* Allocate some anonymous memory.  If possible use the cache.  */
      size_t guardsize;
      size_t reported_guardsize;
      size_t reqsize;
      void *mem;
      const int prot = (PROT_READ | PROT_WRITE
                        | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));

      /* Adjust the stack size for alignment.  */
      size &= ~tls_static_align_m1;
      assert (size != 0);

      /* Make sure the size of the stack is enough for the guard and,
         if necessary, the thread descriptor.  On some targets there is
         a minimum guard size requirement, ARCH_MIN_GUARD_SIZE, so
         internally enforce it (unless the guard was disabled), but
         report the original guard size for backward compatibility:
         before POSIX 2008 the guardsize was specified to be one page
         by default, which is observable via pthread_attr_getguardsize
         and pthread_getattr_np.  */
      guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
      reported_guardsize = guardsize;
      if (guardsize > 0 && guardsize < ARCH_MIN_GUARD_SIZE)
        guardsize = ARCH_MIN_GUARD_SIZE;
      if (guardsize < attr->guardsize || size + guardsize < guardsize)
        /* Arithmetic overflow.  */
        return EINVAL;
      size += guardsize;
      if (__builtin_expect (size < ((guardsize + tls_static_size_for_stack
                                     + MINIMAL_REST_STACK + pagesize_m1)
                                    & ~pagesize_m1),
                            0))
        /* The stack is too small (or the guard too large).  */
        return EINVAL;

      /* Try to get a stack from the cache.  */
      reqsize = size;
      pd = get_cached_stack (&size, &mem);
      if (pd == NULL)
        {
          /* If a guard page is required, avoid committing memory by
             first allocating with PROT_NONE and then setting the
             required permissions on everything except the guard
             page.  */
          mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

          if (__glibc_unlikely (mem == MAP_FAILED))
            return errno;

          /* Do madvise in case the tunable glibc.pthread.stack_hugetlb
             is set to 0, disabling huge pages for stacks.  */
          if (__glibc_unlikely (__nptl_stack_hugetlb == 0)
              && __madvise (mem, size, MADV_NOHUGEPAGE) != 0)
            return errno;

          /* SIZE is guaranteed to be greater than zero.
             So we can never get a null pointer back from mmap.  */
          assert (mem != NULL);

          /* Place the thread descriptor at the end of the stack.  */
#if TLS_TCB_AT_TP
          pd = (struct pthread *) ((((uintptr_t) mem + size)
                                    - TLS_TCB_SIZE)
                                   & ~tls_static_align_m1);
#elif TLS_DTV_AT_TP
          pd = (struct pthread *) ((((uintptr_t) mem + size
                                     - tls_static_size_for_stack)
                                    & ~tls_static_align_m1)
                                   - TLS_PRE_TCB_SIZE);
#endif

          /* Now mprotect the required region excluding the guard area.  */
          if (__glibc_likely (guardsize > 0))
            {
              char *guard = guard_position (mem, size, guardsize, pd,
                                            pagesize_m1);
              if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
                {
                  __munmap (mem, size);
                  return errno;
                }
            }

          /* Remember the stack-related values.  */
          pd->stackblock = mem;
          pd->stackblock_size = size;
          /* Record the guard size of the new allocation so that the
             guard-resize code below can avoid a redundant mprotect.  */
          pd->guardsize = guardsize;

          /* We allocated the first block of the thread-specific data
             array.  This address will not change for the lifetime of
             this descriptor.  */
          pd->specific[0] = pd->specific_1stblock;

          /* This is at least the second thread.  */
          pd->header.multiple_threads = 1;

#ifdef NEED_DL_SYSINFO
          SETUP_THREAD_SYSINFO (pd);
#endif

          /* Don't allow setxid until cloned.  */
          pd->setxid_futex = -1;

          /* Allocate the DTV for this thread.  */
          if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
            {
              /* Something went wrong.  */
              assert (errno == ENOMEM);

              /* Free the stack memory we just allocated.  */
              (void) __munmap (mem, size);

              return errno;
            }

          /* Prepare to modify global data.  */
          lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

          /* And add to the list of stacks in use.  */
          __nptl_stack_list_add (&pd->list, &GL (dl_stack_used));

          lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);

          /* There might have been a race.  Another thread might have
             caused the stacks to get exec permission while this new
             stack was prepared.  Detect if this was possible and
             change the permission if necessary.  */
          if (__builtin_expect ((GL (dl_stack_flags) & PF_X) != 0
                                && (prot & PROT_EXEC) == 0, 0))
            {
              int err = __nptl_change_stack_perm (pd);
              if (err != 0)
                {
                  /* Free the stack memory we just allocated.  */
                  (void) __munmap (mem, size);

                  return err;
                }
            }

          /* Note that all of the stack and the thread descriptor are
             zeroed.  This means we do not have to initialize fields
             with initial value zero.  This is specifically true for
             the 'tid' field which is always set back to zero once the
             stack is not used anymore and for the 'guardsize' field
             which will be read next.  */
        }

      /* Create or resize the guard area if necessary.  */
      if (__glibc_unlikely (guardsize > pd->guardsize))
        {
          char *guard = guard_position (mem, size, guardsize, pd,
                                        pagesize_m1);
          if (__mprotect (guard, guardsize, PROT_NONE) != 0)
            {
            mprot_error:
              lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

              /* Remove the thread from the list.  */
              __nptl_stack_list_del (&pd->list);

              lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);

              /* Get rid of the TLS block we allocated.  */
              _dl_deallocate_tls (TLS_TPADJ (pd), false);

              /* Free the stack memory regardless of whether the size
                 of the cache is over the limit or not.  If this piece
                 of memory caused problems we had better not use it
                 anymore.  Any errors are ignored; there is nothing we
                 could do about them anyway.  */
              (void) __munmap (mem, size);

              return errno;
            }

          pd->guardsize = guardsize;
        }
      else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
                                 0))
        {
          /* The old guard area is too large.  */

#if _STACK_GROWS_DOWN
          if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
                          prot) != 0)
            goto mprot_error;
#elif _STACK_GROWS_UP
          char *new_guard = (char *) (((uintptr_t) pd - guardsize)
                                      & ~pagesize_m1);
          char *old_guard = (char *) (((uintptr_t) pd - pd->guardsize)
                                      & ~pagesize_m1);
          /* The guard size difference might be > 0, but once rounded
             to the nearest page the size difference might be zero.  */
          if (new_guard > old_guard
              && __mprotect (old_guard, new_guard - old_guard, prot) != 0)
            goto mprot_error;
#endif

          pd->guardsize = guardsize;
        }
      /* The pthread_getattr_np calls need to be passed the size
         requested in the attribute, regardless of how large the
         actually used guard size is.  */
      pd->reported_guardsize = reported_guardsize;
    }

  /* Initialize the lock.  We have to do this unconditionally since the
     stillborn thread could be canceled while the lock is taken.  */
  pd->lock = LLL_LOCK_INITIALIZER;

  /* The robust mutex lists also need to be initialized
     unconditionally because the cleanup for the previous stack owner
     might have happened in the kernel.  */
  pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
                                  - offsetof (pthread_mutex_t,
                                              __data.__list.__next));
  pd->robust_head.list_op_pending = NULL;
#if __PTHREAD_MUTEX_HAVE_PREV
  pd->robust_prev = &pd->robust_head;
#endif
  pd->robust_head.list = &pd->robust_head;

  /* We place the thread descriptor at the end of the stack.  */
  *pdp = pd;

  void *stacktop;

#if TLS_TCB_AT_TP
  /* The stack begins before the TCB and the static TLS block.  */
  stacktop = ((char *) (pd + 1) - tls_static_size_for_stack);
#elif TLS_DTV_AT_TP
  stacktop = (char *) (pd - 1);
#endif

  *stacksize = stacktop - pd->stackblock;
  *stack = pd->stackblock;

  return 0;
}
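
/* A minimal usage sketch for the function above (illustrative only;
   the real call site is __pthread_create_2_1 in pthread_create.c, and
   IATTR stands for whatever effective pthread_attr is in use):

     struct pthread *pd;
     void *stackaddr;
     size_t stacksize;
     int err = allocate_stack (iattr, &pd, &stackaddr, &stacksize);
     if (err != 0)
       return err;    (EINVAL, ENOMEM, or another errno value)

   On success PD points at the thread descriptor placed inside the
   stack block, and STACKADDR/STACKSIZE describe the usable stack
   region to hand to the clone call.  */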

/* Maximum name length supported by the initial kernel implementation;
   it is not exported by the kernel user API.  */
#define ANON_VMA_NAME_MAX_LEN 80

#define SET_STACK_NAME(__prefix, __stack, __stacksize, __tid)        \
  ({                                                                  \
    char __stack_name[sizeof (__prefix) +                            \
                      INT_BUFSIZE_BOUND (unsigned int)];             \
    _Static_assert (sizeof __stack_name <= ANON_VMA_NAME_MAX_LEN,    \
                    "VMA name size larger than maximum supported");  \
    __snprintf (__stack_name, sizeof (__stack_name), __prefix "%u",  \
                (unsigned int) __tid);                               \
    __set_vma_name (__stack, __stacksize, __stack_name);             \
  })
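
/* On kernels built with CONFIG_ANON_VMA_NAME the macro above makes a
   thread stack show up in /proc/<pid>/maps with an annotation along
   the lines of

     7f1c2a800000-7f1c2b000000 rw-p ...  [anon: glibc: pthread stack: 12345]

   (addresses invented for illustration; the exact rendering is
   defined by the kernel).  */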

/* Set or remove the name associated with the stack VMA of PD.  */
static void
name_stack_maps (struct pthread *pd, bool set)
{
#if _STACK_GROWS_DOWN
  void *stack = pd->stackblock + pd->guardsize;
#else
  void *stack = pd->stackblock;
#endif
  size_t stacksize = pd->stackblock_size - pd->guardsize;

  if (!set)
    __set_vma_name (stack, stacksize, NULL);
  else
    {
      unsigned int tid = pd->tid;
      if (pd->user_stack)
        SET_STACK_NAME (" glibc: pthread user stack: ", stack, stacksize,
                        tid);
      else
        SET_STACK_NAME (" glibc: pthread stack: ", stack, stacksize, tid);
    }
}